[clang-tools-extra] [clang] [llvm] Move ExpandMemCmp and MergeICmps to the middle end (PR #77370)

Gabriel Baraldi via cfe-commits cfe-commits at lists.llvm.org
Wed Jan 17 09:58:33 PST 2024


https://github.com/gbaraldi updated https://github.com/llvm/llvm-project/pull/77370

>From 4440a91823cf878ea0dec29fb7d511a25f4333c0 Mon Sep 17 00:00:00 2001
From: Gabriel Baraldi <baraldigabriel at gmail.com>
Date: Mon, 8 Jan 2024 17:04:08 -0300
Subject: [PATCH 01/11] Move ExpandMemCmp and MergeICmps to the middle end to
 allow for better optimization of the inline expansions

---
 .../include/llvm/CodeGen/CodeGenPassBuilder.h |    10 -
 .../llvm/CodeGen/MachinePassRegistry.def      |     2 -
 llvm/include/llvm/CodeGen/Passes.h            |     2 -
 llvm/include/llvm/InitializePasses.h          |     1 -
 llvm/include/llvm/LinkAllPasses.h             |     1 -
 .../Scalar}/ExpandMemCmp.h                    |     6 +-
 llvm/lib/CodeGen/CMakeLists.txt               |     1 -
 llvm/lib/CodeGen/CodeGen.cpp                  |     1 -
 llvm/lib/CodeGen/TargetPassConfig.cpp         |    11 -
 llvm/lib/Passes/PassBuilder.cpp               |     2 +-
 llvm/lib/Passes/PassBuilderPipelines.cpp      |     6 +
 llvm/lib/Passes/PassRegistry.def              |     3 +-
 llvm/lib/Transforms/Scalar/CMakeLists.txt     |     1 +
 .../Scalar}/ExpandMemCmp.cpp                  |   133 +-
 llvm/test/CodeGen/AArch64/O3-pipeline.ll      |     7 -
 .../test/CodeGen/AArch64/bcmp-inline-small.ll |    98 -
 llvm/test/CodeGen/AArch64/bcmp.ll             |   537 -
 .../test/CodeGen/AArch64/dag-combine-setcc.ll |    31 +-
 .../AArch64/machine-licm-hoist-load.ll        |   128 +-
 llvm/test/CodeGen/AArch64/memcmp.ll           |  3029 ---
 llvm/test/CodeGen/AMDGPU/llc-pipeline.ll      |    28 -
 llvm/test/CodeGen/ARM/O3-pipeline.ll          |     7 -
 llvm/test/CodeGen/BPF/memcmp.ll               |    77 -
 llvm/test/CodeGen/Generic/llc-start-stop.ll   |     6 +-
 llvm/test/CodeGen/LoongArch/opt-pipeline.ll   |     9 +-
 llvm/test/CodeGen/M68k/pipeline.ll            |     7 -
 llvm/test/CodeGen/PowerPC/O3-pipeline.ll      |     9 +-
 .../memCmpUsedInZeroEqualityComparison.ll     |   168 -
 .../CodeGen/PowerPC/memcmp-mergeexpand.ll     |    39 -
 llvm/test/CodeGen/PowerPC/memcmp.ll           |    62 -
 llvm/test/CodeGen/PowerPC/memcmpIR.ll         |   178 -
 llvm/test/CodeGen/RISCV/O3-pipeline.ll        |     9 +-
 llvm/test/CodeGen/X86/memcmp-mergeexpand.ll   |    49 -
 llvm/test/CodeGen/X86/memcmp-minsize-x32.ll   |   445 -
 llvm/test/CodeGen/X86/memcmp-minsize.ll       |   433 -
 .../CodeGen/X86/memcmp-more-load-pairs-x32.ll |  2911 ---
 .../CodeGen/X86/memcmp-more-load-pairs.ll     |  4006 ---
 llvm/test/CodeGen/X86/memcmp-optsize-x32.ll   |   583 -
 llvm/test/CodeGen/X86/memcmp-optsize.ll       |   596 -
 llvm/test/CodeGen/X86/memcmp-pgso-x32.ll      |   600 -
 llvm/test/CodeGen/X86/memcmp-pgso.ll          |   613 -
 llvm/test/CodeGen/X86/memcmp-x32.ll           |  2429 --
 llvm/test/CodeGen/X86/memcmp.ll               |  3065 ---
 llvm/test/CodeGen/X86/opt-pipeline.ll         |     9 +-
 llvm/test/Other/new-pm-defaults.ll            |     4 +-
 .../Other/new-pm-thinlto-postlink-defaults.ll |     4 +-
 .../new-pm-thinlto-postlink-pgo-defaults.ll   |     4 +-
 ...-pm-thinlto-postlink-samplepgo-defaults.ll |     4 +-
 .../Other/new-pm-thinlto-prelink-defaults.ll  |     4 +-
 .../new-pm-thinlto-prelink-pgo-defaults.ll    |    26 +-
 ...w-pm-thinlto-prelink-samplepgo-defaults.ll |     4 +-
 .../Transforms/ExpandMemCmp/AArch64/bcmp.ll   |   751 +
 .../ExpandMemCmp/AArch64/memcmp-extra.ll      |  3434 +++
 .../Transforms/ExpandMemCmp/AArch64/memcmp.ll |     1 -
 .../Transforms/ExpandMemCmp/BPF/lit.local.cfg |     4 +
 .../Transforms/ExpandMemCmp/BPF/memcmp.ll     |   119 +
 .../ExpandMemCmp/PowerPC/lit.local.cfg        |     2 +
 .../memCmpUsedInZeroEqualityComparison.ll     |   218 +
 .../PowerPC/memcmp-mergeexpand.ll             |    48 +
 .../Transforms/ExpandMemCmp/PowerPC/memcmp.ll |    70 +
 .../ExpandMemCmp/PowerPC/memcmpIR.ll          |   216 +
 llvm/test/Transforms/ExpandMemCmp/X86/bcmp.ll |    16 +-
 .../Transforms/ExpandMemCmp/X86/memcmp-2.ll   | 20249 ++++++++++++++++
 .../ExpandMemCmp}/X86/memcmp-constant.ll      |    89 +-
 .../ExpandMemCmp/X86/memcmp-minsize-x32.ll    |   493 +
 .../ExpandMemCmp/X86/memcmp-minsize.ll        |   707 +
 .../X86/memcmp-more-load-pairs-x32.ll         |  6203 +++++
 .../X86/memcmp-more-load-pairs.ll             | 18833 ++++++++++++++
 .../ExpandMemCmp/X86/memcmp-nobuiltin.ll      |   248 +
 .../ExpandMemCmp/X86/memcmp-optsize-x32.ll    |   870 +
 .../ExpandMemCmp/X86/memcmp-optsize.ll        |  1414 ++
 .../ExpandMemCmp/X86/memcmp-pgso-x32.ll       |   887 +
 .../ExpandMemCmp/X86/memcmp-pgso.ll           |  1347 +
 .../ExpandMemCmp/X86/memcmp-x32-2.ll          |  4813 ++++
 .../Transforms/ExpandMemCmp/X86/memcmp-x32.ll |   523 +-
 .../Transforms/ExpandMemCmp/X86/memcmp.ll     |  1194 +-
 .../PhaseOrdering/PowerPC/lit.local.cfg       |     2 +
 .../PhaseOrdering/X86/memcmp-early.ll         |    86 +
 .../PhaseOrdering/X86/memcmp-mergeexpand.ll   |    62 +
 .../Transforms/PhaseOrdering/X86/memcmp.ll    |   856 +
 llvm/tools/opt/opt.cpp                        |     1 -
 .../gn/secondary/llvm/lib/CodeGen/BUILD.gn    |     1 -
 .../llvm/lib/Transforms/Scalar/BUILD.gn       |     1 +
 83 files changed, 63081 insertions(+), 21075 deletions(-)
 rename llvm/include/llvm/{CodeGen => Transforms/Scalar}/ExpandMemCmp.h (83%)
 rename llvm/lib/{CodeGen => Transforms/Scalar}/ExpandMemCmp.cpp (90%)
 delete mode 100644 llvm/test/CodeGen/AArch64/bcmp-inline-small.ll
 delete mode 100644 llvm/test/CodeGen/AArch64/bcmp.ll
 delete mode 100644 llvm/test/CodeGen/AArch64/memcmp.ll
 delete mode 100644 llvm/test/CodeGen/BPF/memcmp.ll
 delete mode 100644 llvm/test/CodeGen/PowerPC/memCmpUsedInZeroEqualityComparison.ll
 delete mode 100644 llvm/test/CodeGen/PowerPC/memcmp-mergeexpand.ll
 delete mode 100644 llvm/test/CodeGen/PowerPC/memcmp.ll
 delete mode 100644 llvm/test/CodeGen/PowerPC/memcmpIR.ll
 delete mode 100644 llvm/test/CodeGen/X86/memcmp-mergeexpand.ll
 delete mode 100644 llvm/test/CodeGen/X86/memcmp-minsize-x32.ll
 delete mode 100644 llvm/test/CodeGen/X86/memcmp-minsize.ll
 delete mode 100644 llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll
 delete mode 100644 llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll
 delete mode 100644 llvm/test/CodeGen/X86/memcmp-optsize-x32.ll
 delete mode 100644 llvm/test/CodeGen/X86/memcmp-optsize.ll
 delete mode 100644 llvm/test/CodeGen/X86/memcmp-pgso-x32.ll
 delete mode 100644 llvm/test/CodeGen/X86/memcmp-pgso.ll
 delete mode 100644 llvm/test/CodeGen/X86/memcmp-x32.ll
 delete mode 100644 llvm/test/CodeGen/X86/memcmp.ll
 create mode 100644 llvm/test/Transforms/ExpandMemCmp/AArch64/bcmp.ll
 create mode 100644 llvm/test/Transforms/ExpandMemCmp/AArch64/memcmp-extra.ll
 create mode 100644 llvm/test/Transforms/ExpandMemCmp/BPF/lit.local.cfg
 create mode 100644 llvm/test/Transforms/ExpandMemCmp/BPF/memcmp.ll
 create mode 100644 llvm/test/Transforms/ExpandMemCmp/PowerPC/lit.local.cfg
 create mode 100644 llvm/test/Transforms/ExpandMemCmp/PowerPC/memCmpUsedInZeroEqualityComparison.ll
 create mode 100644 llvm/test/Transforms/ExpandMemCmp/PowerPC/memcmp-mergeexpand.ll
 create mode 100644 llvm/test/Transforms/ExpandMemCmp/PowerPC/memcmp.ll
 create mode 100644 llvm/test/Transforms/ExpandMemCmp/PowerPC/memcmpIR.ll
 create mode 100644 llvm/test/Transforms/ExpandMemCmp/X86/memcmp-2.ll
 rename llvm/test/{CodeGen => Transforms/ExpandMemCmp}/X86/memcmp-constant.ll (50%)
 create mode 100644 llvm/test/Transforms/ExpandMemCmp/X86/memcmp-minsize-x32.ll
 create mode 100644 llvm/test/Transforms/ExpandMemCmp/X86/memcmp-minsize.ll
 create mode 100644 llvm/test/Transforms/ExpandMemCmp/X86/memcmp-more-load-pairs-x32.ll
 create mode 100644 llvm/test/Transforms/ExpandMemCmp/X86/memcmp-more-load-pairs.ll
 create mode 100644 llvm/test/Transforms/ExpandMemCmp/X86/memcmp-nobuiltin.ll
 create mode 100644 llvm/test/Transforms/ExpandMemCmp/X86/memcmp-optsize-x32.ll
 create mode 100644 llvm/test/Transforms/ExpandMemCmp/X86/memcmp-optsize.ll
 create mode 100644 llvm/test/Transforms/ExpandMemCmp/X86/memcmp-pgso-x32.ll
 create mode 100644 llvm/test/Transforms/ExpandMemCmp/X86/memcmp-pgso.ll
 create mode 100644 llvm/test/Transforms/ExpandMemCmp/X86/memcmp-x32-2.ll
 create mode 100644 llvm/test/Transforms/PhaseOrdering/PowerPC/lit.local.cfg
 create mode 100644 llvm/test/Transforms/PhaseOrdering/X86/memcmp-early.ll
 create mode 100644 llvm/test/Transforms/PhaseOrdering/X86/memcmp-mergeexpand.ll
 create mode 100644 llvm/test/Transforms/PhaseOrdering/X86/memcmp.ll

diff --git a/llvm/include/llvm/CodeGen/CodeGenPassBuilder.h b/llvm/include/llvm/CodeGen/CodeGenPassBuilder.h
index a7cbb0910baabf..556304231b397b 100644
--- a/llvm/include/llvm/CodeGen/CodeGenPassBuilder.h
+++ b/llvm/include/llvm/CodeGen/CodeGenPassBuilder.h
@@ -25,7 +25,6 @@
 #include "llvm/Analysis/TypeBasedAliasAnalysis.h"
 #include "llvm/CodeGen/CallBrPrepare.h"
 #include "llvm/CodeGen/DwarfEHPrepare.h"
-#include "llvm/CodeGen/ExpandMemCmp.h"
 #include "llvm/CodeGen/ExpandReductions.h"
 #include "llvm/CodeGen/GCMetadata.h"
 #include "llvm/CodeGen/IndirectBrExpand.h"
@@ -629,15 +628,6 @@ void CodeGenPassBuilder<Derived>::addIRPasses(AddIRPass &addPass) const {
       addPass(PrintFunctionPass(dbgs(), "\n\n*** Code after LSR ***\n"));
   }
 
-  if (getOptLevel() != CodeGenOptLevel::None) {
-    // The MergeICmpsPass tries to create memcmp calls by grouping sequences of
-    // loads and compares. ExpandMemCmpPass then tries to expand those calls
-    // into optimally-sized loads and compares. The transforms are enabled by a
-    // target lowering hook.
-    if (!Opt.DisableMergeICmps)
-      addPass(MergeICmpsPass());
-    addPass(ExpandMemCmpPass(&TM));
-  }
 
   // Run GC lowering passes for builtin collectors
   // TODO: add a pass insertion point here
diff --git a/llvm/include/llvm/CodeGen/MachinePassRegistry.def b/llvm/include/llvm/CodeGen/MachinePassRegistry.def
index f950dfae7e338b..3c00668aae3897 100644
--- a/llvm/include/llvm/CodeGen/MachinePassRegistry.def
+++ b/llvm/include/llvm/CodeGen/MachinePassRegistry.def
@@ -47,7 +47,6 @@ FUNCTION_PASS("dwarf-eh-prepare", DwarfEHPreparePass, (TM))
 FUNCTION_PASS("ee-instrument", EntryExitInstrumenterPass, (false))
 FUNCTION_PASS("expand-large-div-rem", ExpandLargeDivRemPass, (TM))
 FUNCTION_PASS("expand-large-fp-convert", ExpandLargeFpConvertPass, (TM))
-FUNCTION_PASS("expand-memcmp", ExpandMemCmpPass, (TM))
 FUNCTION_PASS("expand-reductions", ExpandReductionsPass, ())
 FUNCTION_PASS("expandvp", ExpandVectorPredicationPass, ())
 FUNCTION_PASS("indirectbr-expand", IndirectBrExpandPass, (TM))
@@ -55,7 +54,6 @@ FUNCTION_PASS("interleaved-access", InterleavedAccessPass, (TM))
 FUNCTION_PASS("interleaved-load-combine", InterleavedLoadCombinePass, (TM))
 FUNCTION_PASS("lower-constant-intrinsics", LowerConstantIntrinsicsPass, ())
 FUNCTION_PASS("lowerinvoke", LowerInvokePass, ())
-FUNCTION_PASS("mergeicmps", MergeICmpsPass, ())
 FUNCTION_PASS("partially-inline-libcalls", PartiallyInlineLibCallsPass, ())
 FUNCTION_PASS("post-inline-ee-instrument", EntryExitInstrumenterPass, (true))
 FUNCTION_PASS("replace-with-veclib", ReplaceWithVeclib, ())
diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h
index ca9fbb1def7624..e5ed5f15f62ed7 100644
--- a/llvm/include/llvm/CodeGen/Passes.h
+++ b/llvm/include/llvm/CodeGen/Passes.h
@@ -519,8 +519,6 @@ namespace llvm {
   // Expands large div/rem instructions.
   FunctionPass *createExpandLargeFpConvertPass();
 
-  // This pass expands memcmp() to load/stores.
-  FunctionPass *createExpandMemCmpLegacyPass();
 
   /// Creates Break False Dependencies pass. \see BreakFalseDeps.cpp
   FunctionPass *createBreakFalseDeps();
diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h
index 46b1e95c3c15f3..b0ca9fa942cda3 100644
--- a/llvm/include/llvm/InitializePasses.h
+++ b/llvm/include/llvm/InitializePasses.h
@@ -103,7 +103,6 @@ void initializeEdgeBundlesPass(PassRegistry&);
 void initializeEHContGuardCatchretPass(PassRegistry &);
 void initializeExpandLargeFpConvertLegacyPassPass(PassRegistry&);
 void initializeExpandLargeDivRemLegacyPassPass(PassRegistry&);
-void initializeExpandMemCmpLegacyPassPass(PassRegistry &);
 void initializeExpandPostRAPass(PassRegistry&);
 void initializeExpandReductionsPass(PassRegistry&);
 void initializeExpandVectorPredicationPass(PassRegistry &);
diff --git a/llvm/include/llvm/LinkAllPasses.h b/llvm/include/llvm/LinkAllPasses.h
index 7a21876e565a7c..9aff428fbe938b 100644
--- a/llvm/include/llvm/LinkAllPasses.h
+++ b/llvm/include/llvm/LinkAllPasses.h
@@ -119,7 +119,6 @@ namespace {
       (void) llvm::createPostDomTree();
       (void) llvm::createMergeICmpsLegacyPass();
       (void) llvm::createExpandLargeDivRemPass();
-      (void)llvm::createExpandMemCmpLegacyPass();
       (void) llvm::createExpandVectorPredicationPass();
       std::string buf;
       llvm::raw_string_ostream os(buf);
diff --git a/llvm/include/llvm/CodeGen/ExpandMemCmp.h b/llvm/include/llvm/Transforms/Scalar/ExpandMemCmp.h
similarity index 83%
rename from llvm/include/llvm/CodeGen/ExpandMemCmp.h
rename to llvm/include/llvm/Transforms/Scalar/ExpandMemCmp.h
index 94a877854f327a..94ba0cf9305040 100644
--- a/llvm/include/llvm/CodeGen/ExpandMemCmp.h
+++ b/llvm/include/llvm/Transforms/Scalar/ExpandMemCmp.h
@@ -6,8 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_CODEGEN_EXPANDMEMCMP_H
-#define LLVM_CODEGEN_EXPANDMEMCMP_H
+#ifndef LLVM_TRANSFORMS_SCALAR_EXPANDMEMCMP_H
+#define LLVM_TRANSFORMS_SCALAR_EXPANDMEMCMP_H
 
 #include "llvm/IR/PassManager.h"
 
@@ -26,4 +26,4 @@ class ExpandMemCmpPass : public PassInfoMixin<ExpandMemCmpPass> {
 
 } // namespace llvm
 
-#endif // LLVM_CODEGEN_EXPANDMEMCMP_H
+#endif // LLVM_TRANSFORMS_SCALAR_EXPANDMEMCMP_H
diff --git a/llvm/lib/CodeGen/CMakeLists.txt b/llvm/lib/CodeGen/CMakeLists.txt
index df2d1831ee5fdb..518432e9a7b32f 100644
--- a/llvm/lib/CodeGen/CMakeLists.txt
+++ b/llvm/lib/CodeGen/CMakeLists.txt
@@ -71,7 +71,6 @@ add_llvm_component_library(LLVMCodeGen
   ExecutionDomainFix.cpp
   ExpandLargeDivRem.cpp
   ExpandLargeFpConvert.cpp
-  ExpandMemCmp.cpp
   ExpandPostRAPseudos.cpp
   ExpandReductions.cpp
   ExpandVectorPredication.cpp
diff --git a/llvm/lib/CodeGen/CodeGen.cpp b/llvm/lib/CodeGen/CodeGen.cpp
index 7b73a7b11ddf1c..043fa4e6eabe8f 100644
--- a/llvm/lib/CodeGen/CodeGen.cpp
+++ b/llvm/lib/CodeGen/CodeGen.cpp
@@ -41,7 +41,6 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {
   initializeEarlyTailDuplicatePass(Registry);
   initializeExpandLargeDivRemLegacyPassPass(Registry);
   initializeExpandLargeFpConvertLegacyPassPass(Registry);
-  initializeExpandMemCmpLegacyPassPass(Registry);
   initializeExpandPostRAPass(Registry);
   initializeFEntryInserterPass(Registry);
   initializeFinalizeISelPass(Registry);
diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp
index 4003a08a5422dd..33562e90e94426 100644
--- a/llvm/lib/CodeGen/TargetPassConfig.cpp
+++ b/llvm/lib/CodeGen/TargetPassConfig.cpp
@@ -108,9 +108,6 @@ static cl::opt<bool> EnableImplicitNullChecks(
     "enable-implicit-null-checks",
     cl::desc("Fold null checks into faulting memory operations"),
     cl::init(false), cl::Hidden);
-static cl::opt<bool> DisableMergeICmps("disable-mergeicmps",
-    cl::desc("Disable MergeICmps Pass"),
-    cl::init(false), cl::Hidden);
 static cl::opt<bool> PrintLSR("print-lsr-output", cl::Hidden,
     cl::desc("Print LLVM IR produced by the loop-reduce pass"));
 static cl::opt<bool>
@@ -487,7 +484,6 @@ CGPassBuilderOption llvm::getCGPassBuilderOption() {
   SET_BOOLEAN_OPTION(EnableImplicitNullChecks)
   SET_BOOLEAN_OPTION(EnableMachineOutliner)
   SET_BOOLEAN_OPTION(MISchedPostRA)
-  SET_BOOLEAN_OPTION(DisableMergeICmps)
   SET_BOOLEAN_OPTION(DisableLSR)
   SET_BOOLEAN_OPTION(DisableConstantHoisting)
   SET_BOOLEAN_OPTION(DisableCGP)
@@ -872,13 +868,6 @@ void TargetPassConfig::addIRPasses() {
                                         "\n\n*** Code after LSR ***\n"));
     }
 
-    // The MergeICmpsPass tries to create memcmp calls by grouping sequences of
-    // loads and compares. ExpandMemCmpPass then tries to expand those calls
-    // into optimally-sized loads and compares. The transforms are enabled by a
-    // target lowering hook.
-    if (!DisableMergeICmps)
-      addPass(createMergeICmpsLegacyPass());
-    addPass(createExpandMemCmpLegacyPass());
   }
 
   // Run GC lowering passes for builtin collectors
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 439f749bda8bb7..20448554756aca 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -76,7 +76,6 @@
 #include "llvm/CodeGen/DwarfEHPrepare.h"
 #include "llvm/CodeGen/ExpandLargeDivRem.h"
 #include "llvm/CodeGen/ExpandLargeFpConvert.h"
-#include "llvm/CodeGen/ExpandMemCmp.h"
 #include "llvm/CodeGen/GCMetadata.h"
 #include "llvm/CodeGen/HardwareLoops.h"
 #include "llvm/CodeGen/IndirectBrExpand.h"
@@ -181,6 +180,7 @@
 #include "llvm/Transforms/Scalar/DeadStoreElimination.h"
 #include "llvm/Transforms/Scalar/DivRemPairs.h"
 #include "llvm/Transforms/Scalar/EarlyCSE.h"
+#include "llvm/Transforms/Scalar/ExpandMemCmp.h"
 #include "llvm/Transforms/Scalar/FlattenCFG.h"
 #include "llvm/Transforms/Scalar/Float2Int.h"
 #include "llvm/Transforms/Scalar/GVN.h"
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index 5c6c391049a7b2..e2dd413f12d696 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -86,6 +86,7 @@
 #include "llvm/Transforms/Scalar/DeadStoreElimination.h"
 #include "llvm/Transforms/Scalar/DivRemPairs.h"
 #include "llvm/Transforms/Scalar/EarlyCSE.h"
+#include "llvm/Transforms/Scalar/ExpandMemCmp.h"
 #include "llvm/Transforms/Scalar/Float2Int.h"
 #include "llvm/Transforms/Scalar/GVN.h"
 #include "llvm/Transforms/Scalar/IndVarSimplify.h"
@@ -111,6 +112,7 @@
 #include "llvm/Transforms/Scalar/LowerExpectIntrinsic.h"
 #include "llvm/Transforms/Scalar/LowerMatrixIntrinsics.h"
 #include "llvm/Transforms/Scalar/MemCpyOptimizer.h"
+#include "llvm/Transforms/Scalar/MergeICmps.h"
 #include "llvm/Transforms/Scalar/MergedLoadStoreMotion.h"
 #include "llvm/Transforms/Scalar/NewGVN.h"
 #include "llvm/Transforms/Scalar/Reassociate.h"
@@ -386,6 +388,8 @@ PassBuilder::buildO1FunctionSimplificationPipeline(OptimizationLevel Level,
   if (AreStatisticsEnabled())
     FPM.addPass(CountVisitsPass());
 
+  FPM.addPass(MergeICmpsPass());
+  FPM.addPass(ExpandMemCmpPass(TM));
   // Form SSA out of local memory accesses after breaking apart aggregates into
   // scalars.
   FPM.addPass(SROAPass(SROAOptions::ModifyCFG));
@@ -532,6 +536,8 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
   if (AreStatisticsEnabled())
     FPM.addPass(CountVisitsPass());
 
+  FPM.addPass(MergeICmpsPass());
+  FPM.addPass(ExpandMemCmpPass(TM));
   // Form SSA out of local memory accesses after breaking apart aggregates into
   // scalars.
   FPM.addPass(SROAPass(SROAOptions::ModifyCFG));
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index 82ce040c649626..31adbf1942b410 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -353,6 +353,7 @@ FUNCTION_PASS("mem2reg", PromotePass())
 FUNCTION_PASS("memcpyopt", MemCpyOptPass())
 FUNCTION_PASS("memprof", MemProfilerPass())
 FUNCTION_PASS("mergeicmps", MergeICmpsPass())
+FUNCTION_PASS("expand-memcmp", ExpandMemCmpPass(TM))
 FUNCTION_PASS("mergereturn", UnifyFunctionExitNodesPass())
 FUNCTION_PASS("move-auto-init", MoveAutoInitPass())
 FUNCTION_PASS("nary-reassociate", NaryReassociatePass())
@@ -415,7 +416,7 @@ FUNCTION_PASS("structurizecfg", StructurizeCFGPass())
 FUNCTION_PASS("tailcallelim", TailCallElimPass())
 FUNCTION_PASS("tlshoist", TLSVariableHoistPass())
 FUNCTION_PASS("transform-warning", WarnMissedTransformationsPass())
-FUNCTION_PASS("trigger-verifier-error", TriggerVerifierErrorPass())  
+FUNCTION_PASS("trigger-verifier-error", TriggerVerifierErrorPass())
 FUNCTION_PASS("tsan", ThreadSanitizerPass())
 FUNCTION_PASS("typepromotion", TypePromotionPass(TM))
 FUNCTION_PASS("unify-loop-exits", UnifyLoopExitsPass())
diff --git a/llvm/lib/Transforms/Scalar/CMakeLists.txt b/llvm/lib/Transforms/Scalar/CMakeLists.txt
index 2dd27037a17de7..f6e666dd071256 100644
--- a/llvm/lib/Transforms/Scalar/CMakeLists.txt
+++ b/llvm/lib/Transforms/Scalar/CMakeLists.txt
@@ -11,6 +11,7 @@ add_llvm_component_library(LLVMScalarOpts
   DeadStoreElimination.cpp
   DFAJumpThreading.cpp
   DivRemPairs.cpp
+  ExpandMemCmp.cpp
   EarlyCSE.cpp
   FlattenCFGPass.cpp
   Float2Int.cpp
diff --git a/llvm/lib/CodeGen/ExpandMemCmp.cpp b/llvm/lib/Transforms/Scalar/ExpandMemCmp.cpp
similarity index 90%
rename from llvm/lib/CodeGen/ExpandMemCmp.cpp
rename to llvm/lib/Transforms/Scalar/ExpandMemCmp.cpp
index bb84813569f4d5..973875ee142978 100644
--- a/llvm/lib/CodeGen/ExpandMemCmp.cpp
+++ b/llvm/lib/Transforms/Scalar/ExpandMemCmp.cpp
@@ -11,21 +11,22 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/CodeGen/ExpandMemCmp.h"
+#include "llvm/Transforms/Scalar/ExpandMemCmp.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/GlobalsModRef.h"
 #include "llvm/Analysis/LazyBlockFrequencyInfo.h"
 #include "llvm/Analysis/ProfileSummaryInfo.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
-#include "llvm/CodeGen/TargetPassConfig.h"
-#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/Constants.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/InitializePasses.h"
+#include "llvm/Support/Debug.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Local.h"
@@ -35,9 +36,6 @@
 using namespace llvm;
 using namespace llvm::PatternMatch;
 
-namespace llvm {
-class TargetLowering;
-}
 
 #define DEBUG_TYPE "expand-memcmp"
 
@@ -305,6 +303,7 @@ unsigned MemCmpExpansion::getNumBlocks() {
 }
 
 void MemCmpExpansion::createLoadCmpBlocks() {
+  assert(ResBlock.BB && "ResBlock must be created before LoadCmpBlocks");
   for (unsigned i = 0; i < getNumBlocks(); i++) {
     BasicBlock *BB = BasicBlock::Create(CI->getContext(), "loadbb",
                                         EndBlock->getParent(), EndBlock);
@@ -313,6 +312,7 @@ void MemCmpExpansion::createLoadCmpBlocks() {
 }
 
 void MemCmpExpansion::createResultBlock() {
+  assert(EndBlock && "EndBlock must be created before ResultBlock");
   ResBlock.BB = BasicBlock::Create(CI->getContext(), "res_block",
                                    EndBlock->getParent(), EndBlock);
 }
@@ -828,9 +828,9 @@ Value *MemCmpExpansion::getMemCmpExpansion() {
 ///  %phi.res = phi i32 [ %48, %loadbb3 ], [ %11, %res_block ]
 ///  ret i32 %phi.res
 static bool expandMemCmp(CallInst *CI, const TargetTransformInfo *TTI,
-                         const TargetLowering *TLI, const DataLayout *DL,
-                         ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI,
-                         DomTreeUpdater *DTU, const bool IsBCmp) {
+                         const DataLayout *DL, ProfileSummaryInfo *PSI,
+                         BlockFrequencyInfo *BFI, DomTreeUpdater *DTU,
+                         const bool IsBCmp) {
   NumMemCmpCalls++;
 
   // Early exit from expansion if -Oz.
@@ -845,9 +845,7 @@ static bool expandMemCmp(CallInst *CI, const TargetTransformInfo *TTI,
   }
   const uint64_t SizeVal = SizeCast->getZExtValue();
 
-  if (SizeVal == 0) {
-    return false;
-  }
+
   // TTI call to check if target would like to expand memcmp. Also, get the
   // available load sizes.
   const bool IsUsedForZeroCmp =
@@ -857,28 +855,33 @@ static bool expandMemCmp(CallInst *CI, const TargetTransformInfo *TTI,
   auto Options = TTI->enableMemCmpExpansion(OptForSize,
                                             IsUsedForZeroCmp);
   if (!Options) return false;
+  Value *Res = nullptr;
 
-  if (MemCmpEqZeroNumLoadsPerBlock.getNumOccurrences())
-    Options.NumLoadsPerBlock = MemCmpEqZeroNumLoadsPerBlock;
-
-  if (OptForSize &&
-      MaxLoadsPerMemcmpOptSize.getNumOccurrences())
-    Options.MaxNumLoads = MaxLoadsPerMemcmpOptSize;
+  if (SizeVal == 0) {
+    Res = ConstantInt::get(CI->getFunctionType()->getReturnType(), 0);
+  } else {
+    if (MemCmpEqZeroNumLoadsPerBlock.getNumOccurrences())
+      Options.NumLoadsPerBlock = MemCmpEqZeroNumLoadsPerBlock;
 
-  if (!OptForSize && MaxLoadsPerMemcmp.getNumOccurrences())
-    Options.MaxNumLoads = MaxLoadsPerMemcmp;
+    if (OptForSize &&
+        MaxLoadsPerMemcmpOptSize.getNumOccurrences())
+      Options.MaxNumLoads = MaxLoadsPerMemcmpOptSize;
 
-  MemCmpExpansion Expansion(CI, SizeVal, Options, IsUsedForZeroCmp, *DL, DTU);
+    if (!OptForSize && MaxLoadsPerMemcmp.getNumOccurrences())
+      Options.MaxNumLoads = MaxLoadsPerMemcmp;
 
-  // Don't expand if this will require more loads than desired by the target.
-  if (Expansion.getNumLoads() == 0) {
-    NumMemCmpGreaterThanMax++;
-    return false;
-  }
+    MemCmpExpansion Expansion(CI, SizeVal, Options, IsUsedForZeroCmp, *DL, DTU);
 
-  NumMemCmpInlined++;
+    // Don't expand if this will require more loads than desired by the target.
+    if (Expansion.getNumLoads() == 0) {
+      NumMemCmpGreaterThanMax++;
+      return false;
+    }
 
-  if (Value *Res = Expansion.getMemCmpExpansion()) {
+    NumMemCmpInlined++;
+    Res = Expansion.getMemCmpExpansion();
+  }
+  if (Res) {
     // Replace call with result of expansion and erase call.
     CI->replaceAllUsesWith(Res);
     CI->eraseFromParent();
@@ -889,62 +892,18 @@ static bool expandMemCmp(CallInst *CI, const TargetTransformInfo *TTI,
 
 // Returns true if a change was made.
 static bool runOnBlock(BasicBlock &BB, const TargetLibraryInfo *TLI,
-                       const TargetTransformInfo *TTI, const TargetLowering *TL,
+                       const TargetTransformInfo *TTI,
                        const DataLayout &DL, ProfileSummaryInfo *PSI,
                        BlockFrequencyInfo *BFI, DomTreeUpdater *DTU);
 
 static PreservedAnalyses runImpl(Function &F, const TargetLibraryInfo *TLI,
                                  const TargetTransformInfo *TTI,
-                                 const TargetLowering *TL,
                                  ProfileSummaryInfo *PSI,
                                  BlockFrequencyInfo *BFI, DominatorTree *DT);
 
-class ExpandMemCmpLegacyPass : public FunctionPass {
-public:
-  static char ID;
-
-  ExpandMemCmpLegacyPass() : FunctionPass(ID) {
-    initializeExpandMemCmpLegacyPassPass(*PassRegistry::getPassRegistry());
-  }
-
-  bool runOnFunction(Function &F) override {
-    if (skipFunction(F)) return false;
-
-    auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
-    if (!TPC) {
-      return false;
-    }
-    const TargetLowering* TL =
-        TPC->getTM<TargetMachine>().getSubtargetImpl(F)->getTargetLowering();
-
-    const TargetLibraryInfo *TLI =
-        &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
-    const TargetTransformInfo *TTI =
-        &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
-    auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
-    auto *BFI = (PSI && PSI->hasProfileSummary()) ?
-           &getAnalysis<LazyBlockFrequencyInfoPass>().getBFI() :
-           nullptr;
-    DominatorTree *DT = nullptr;
-    if (auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>())
-      DT = &DTWP->getDomTree();
-    auto PA = runImpl(F, TLI, TTI, TL, PSI, BFI, DT);
-    return !PA.areAllPreserved();
-  }
-
-private:
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.addRequired<TargetLibraryInfoWrapperPass>();
-    AU.addRequired<TargetTransformInfoWrapperPass>();
-    AU.addRequired<ProfileSummaryInfoWrapperPass>();
-    AU.addPreserved<DominatorTreeWrapperPass>();
-    LazyBlockFrequencyInfoPass::getLazyBFIAnalysisUsage(AU);
-    FunctionPass::getAnalysisUsage(AU);
-  }
-};
 
 bool runOnBlock(BasicBlock &BB, const TargetLibraryInfo *TLI,
-                const TargetTransformInfo *TTI, const TargetLowering *TL,
+                const TargetTransformInfo *TTI,
                 const DataLayout &DL, ProfileSummaryInfo *PSI,
                 BlockFrequencyInfo *BFI, DomTreeUpdater *DTU) {
   for (Instruction &I : BB) {
@@ -955,7 +914,7 @@ bool runOnBlock(BasicBlock &BB, const TargetLibraryInfo *TLI,
     LibFunc Func;
     if (TLI->getLibFunc(*CI, Func) &&
         (Func == LibFunc_memcmp || Func == LibFunc_bcmp) &&
-        expandMemCmp(CI, TTI, TL, &DL, PSI, BFI, DTU, Func == LibFunc_bcmp)) {
+        expandMemCmp(CI, TTI, &DL, PSI, BFI, DTU, Func == LibFunc_bcmp)) {
       return true;
     }
   }
@@ -963,8 +922,7 @@ bool runOnBlock(BasicBlock &BB, const TargetLibraryInfo *TLI,
 }
 
 PreservedAnalyses runImpl(Function &F, const TargetLibraryInfo *TLI,
-                          const TargetTransformInfo *TTI,
-                          const TargetLowering *TL, ProfileSummaryInfo *PSI,
+                          const TargetTransformInfo *TTI, ProfileSummaryInfo *PSI,
                           BlockFrequencyInfo *BFI, DominatorTree *DT) {
   std::optional<DomTreeUpdater> DTU;
   if (DT)
@@ -973,7 +931,7 @@ PreservedAnalyses runImpl(Function &F, const TargetLibraryInfo *TLI,
   const DataLayout& DL = F.getParent()->getDataLayout();
   bool MadeChanges = false;
   for (auto BBIt = F.begin(); BBIt != F.end();) {
-    if (runOnBlock(*BBIt, TLI, TTI, TL, DL, PSI, BFI, DTU ? &*DTU : nullptr)) {
+    if (runOnBlock(*BBIt, TLI, TTI, DL, PSI, BFI, DTU ? &*DTU : nullptr)) {
       MadeChanges = true;
       // If changes were made, restart the function from the beginning, since
       // the structure of the function was changed.
@@ -996,7 +954,6 @@ PreservedAnalyses runImpl(Function &F, const TargetLibraryInfo *TLI,
 
 PreservedAnalyses ExpandMemCmpPass::run(Function &F,
                                         FunctionAnalysisManager &FAM) {
-  const auto *TL = TM->getSubtargetImpl(F)->getTargetLowering();
   const auto &TLI = FAM.getResult<TargetLibraryAnalysis>(F);
   const auto &TTI = FAM.getResult<TargetIRAnalysis>(F);
   auto *PSI = FAM.getResult<ModuleAnalysisManagerFunctionProxy>(F)
@@ -1005,21 +962,5 @@ PreservedAnalyses ExpandMemCmpPass::run(Function &F,
                                 ? &FAM.getResult<BlockFrequencyAnalysis>(F)
                                 : nullptr;
   auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
-
-  return runImpl(F, &TLI, &TTI, TL, PSI, BFI, DT);
-}
-
-char ExpandMemCmpLegacyPass::ID = 0;
-INITIALIZE_PASS_BEGIN(ExpandMemCmpLegacyPass, DEBUG_TYPE,
-                      "Expand memcmp() to load/stores", false, false)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LazyBlockFrequencyInfoPass)
-INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_END(ExpandMemCmpLegacyPass, DEBUG_TYPE,
-                    "Expand memcmp() to load/stores", false, false)
-
-FunctionPass *llvm::createExpandMemCmpLegacyPass() {
-  return new ExpandMemCmpLegacyPass();
+  return runImpl(F, &TLI, &TTI, PSI, BFI, DT);
 }
diff --git a/llvm/test/CodeGen/AArch64/O3-pipeline.ll b/llvm/test/CodeGen/AArch64/O3-pipeline.ll
index 638f26298ee26a..c96c1edebaf8cc 100644
--- a/llvm/test/CodeGen/AArch64/O3-pipeline.ll
+++ b/llvm/test/CodeGen/AArch64/O3-pipeline.ll
@@ -43,13 +43,6 @@
 ; CHECK-NEXT:         Canonicalize Freeze Instructions in Loops
 ; CHECK-NEXT:         Induction Variable Users
 ; CHECK-NEXT:         Loop Strength Reduction
-; CHECK-NEXT:       Basic Alias Analysis (stateless AA impl)
-; CHECK-NEXT:       Function Alias Analysis Results
-; CHECK-NEXT:       Merge contiguous icmps into a memcmp
-; CHECK-NEXT:       Natural Loop Information
-; CHECK-NEXT:       Lazy Branch Probability Analysis
-; CHECK-NEXT:       Lazy Block Frequency Analysis
-; CHECK-NEXT:       Expand memcmp() to load/stores
 ; CHECK-NEXT:       Lower Garbage Collection Instructions
 ; CHECK-NEXT:       Shadow Stack GC Lowering
 ; CHECK-NEXT:       Lower constant intrinsics
diff --git a/llvm/test/CodeGen/AArch64/bcmp-inline-small.ll b/llvm/test/CodeGen/AArch64/bcmp-inline-small.ll
deleted file mode 100644
index 4846c46e648178..00000000000000
--- a/llvm/test/CodeGen/AArch64/bcmp-inline-small.ll
+++ /dev/null
@@ -1,98 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -O2 < %s -mtriple=aarch64-linux-gnu                     | FileCheck %s --check-prefix=CHECKN
-; RUN: llc -O2 < %s -mtriple=aarch64-linux-gnu -mattr=strict-align | FileCheck %s --check-prefix=CHECKS
-
-declare i32 @bcmp(ptr, ptr, i64) nounwind readonly
-declare i32 @memcmp(ptr, ptr, i64) nounwind readonly
-
-define i1 @test_b2(ptr %s1, ptr %s2) {
-; CHECKN-LABEL: test_b2:
-; CHECKN:       // %bb.0: // %entry
-; CHECKN-NEXT:    ldr x8, [x0]
-; CHECKN-NEXT:    ldr x9, [x1]
-; CHECKN-NEXT:    ldur x10, [x0, #7]
-; CHECKN-NEXT:    ldur x11, [x1, #7]
-; CHECKN-NEXT:    cmp x8, x9
-; CHECKN-NEXT:    ccmp x10, x11, #0, eq
-; CHECKN-NEXT:    cset w0, eq
-; CHECKN-NEXT:    ret
-;
-; CHECKS-LABEL: test_b2:
-; CHECKS:       // %bb.0: // %entry
-; CHECKS-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECKS-NEXT:    .cfi_def_cfa_offset 16
-; CHECKS-NEXT:    .cfi_offset w30, -16
-; CHECKS-NEXT:    mov w2, #15 // =0xf
-; CHECKS-NEXT:    bl bcmp
-; CHECKS-NEXT:    cmp w0, #0
-; CHECKS-NEXT:    cset w0, eq
-; CHECKS-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECKS-NEXT:    ret
-entry:
-  %bcmp = call i32 @bcmp(ptr %s1, ptr %s2, i64 15)
-  %ret = icmp eq i32 %bcmp, 0
-  ret i1 %ret
-}
-
-; TODO: Four loads should be within the limit, but the heuristic isn't implemented.
-define i1 @test_b2_align8(ptr align 8 %s1, ptr align 8 %s2) {
-; CHECKN-LABEL: test_b2_align8:
-; CHECKN:       // %bb.0: // %entry
-; CHECKN-NEXT:    ldr x8, [x0]
-; CHECKN-NEXT:    ldr x9, [x1]
-; CHECKN-NEXT:    ldur x10, [x0, #7]
-; CHECKN-NEXT:    ldur x11, [x1, #7]
-; CHECKN-NEXT:    cmp x8, x9
-; CHECKN-NEXT:    ccmp x10, x11, #0, eq
-; CHECKN-NEXT:    cset w0, eq
-; CHECKN-NEXT:    ret
-;
-; CHECKS-LABEL: test_b2_align8:
-; CHECKS:       // %bb.0: // %entry
-; CHECKS-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECKS-NEXT:    .cfi_def_cfa_offset 16
-; CHECKS-NEXT:    .cfi_offset w30, -16
-; CHECKS-NEXT:    mov w2, #15 // =0xf
-; CHECKS-NEXT:    bl bcmp
-; CHECKS-NEXT:    cmp w0, #0
-; CHECKS-NEXT:    cset w0, eq
-; CHECKS-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECKS-NEXT:    ret
-entry:
-  %bcmp = call i32 @bcmp(ptr %s1, ptr %s2, i64 15)
-  %ret = icmp eq i32 %bcmp, 0
-  ret i1 %ret
-}
-
-define i1 @test_bs(ptr %s1, ptr %s2) optsize {
-; CHECKN-LABEL: test_bs:
-; CHECKN:       // %bb.0: // %entry
-; CHECKN-NEXT:    ldp x8, x11, [x1]
-; CHECKN-NEXT:    ldr x12, [x0, #16]
-; CHECKN-NEXT:    ldp x9, x10, [x0]
-; CHECKN-NEXT:    ldr x13, [x1, #16]
-; CHECKN-NEXT:    cmp x9, x8
-; CHECKN-NEXT:    ldur x8, [x0, #23]
-; CHECKN-NEXT:    ldur x9, [x1, #23]
-; CHECKN-NEXT:    ccmp x10, x11, #0, eq
-; CHECKN-NEXT:    ccmp x12, x13, #0, eq
-; CHECKN-NEXT:    ccmp x8, x9, #0, eq
-; CHECKN-NEXT:    cset w0, eq
-; CHECKN-NEXT:    ret
-;
-; CHECKS-LABEL: test_bs:
-; CHECKS:       // %bb.0: // %entry
-; CHECKS-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECKS-NEXT:    .cfi_def_cfa_offset 16
-; CHECKS-NEXT:    .cfi_offset w30, -16
-; CHECKS-NEXT:    mov w2, #31 // =0x1f
-; CHECKS-NEXT:    bl memcmp
-; CHECKS-NEXT:    cmp w0, #0
-; CHECKS-NEXT:    cset w0, eq
-; CHECKS-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECKS-NEXT:    ret
-entry:
-  %memcmp = call i32 @memcmp(ptr %s1, ptr %s2, i64 31)
-  %ret = icmp eq i32 %memcmp, 0
-  ret i1 %ret
-}
diff --git a/llvm/test/CodeGen/AArch64/bcmp.ll b/llvm/test/CodeGen/AArch64/bcmp.ll
deleted file mode 100644
index fee52ead989629..00000000000000
--- a/llvm/test/CodeGen/AArch64/bcmp.ll
+++ /dev/null
@@ -1,537 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -O2 < %s -mtriple=aarch64-linux-gnu                     | FileCheck %s
-
-declare i32 @bcmp(ptr, ptr, i64)
-
-define i1 @bcmp0(ptr %a, ptr %b) {
-; CHECK-LABEL: bcmp0:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w0, #1 // =0x1
-; CHECK-NEXT:    ret
-  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 0)
-  %r = icmp eq i32 %cr, 0
-  ret i1 %r
-}
-
-define i1 @bcmp1(ptr %a, ptr %b) {
-; CHECK-LABEL: bcmp1:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldrb w8, [x0]
-; CHECK-NEXT:    ldrb w9, [x1]
-; CHECK-NEXT:    cmp w8, w9
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 1)
-  %r = icmp eq i32 %cr, 0
-  ret i1 %r
-}
-
-define i1 @bcmp2(ptr %a, ptr %b) {
-; CHECK-LABEL: bcmp2:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldrh w8, [x0]
-; CHECK-NEXT:    ldrh w9, [x1]
-; CHECK-NEXT:    cmp w8, w9
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 2)
-  %r = icmp eq i32 %cr, 0
-  ret i1 %r
-}
-
-; or (and (xor a, b), C1), (and (xor c, d), C2)
-define i1 @bcmp3(ptr %a, ptr %b) {
-; CHECK-LABEL: bcmp3:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldrh w8, [x0]
-; CHECK-NEXT:    ldrh w9, [x1]
-; CHECK-NEXT:    ldrb w10, [x0, #2]
-; CHECK-NEXT:    ldrb w11, [x1, #2]
-; CHECK-NEXT:    cmp w8, w9
-; CHECK-NEXT:    ccmp w10, w11, #0, eq
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 3)
-  %r = icmp eq i32 %cr, 0
-  ret i1 %r
-}
-
-define i1 @bcmp4(ptr %a, ptr %b) {
-; CHECK-LABEL: bcmp4:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr w8, [x0]
-; CHECK-NEXT:    ldr w9, [x1]
-; CHECK-NEXT:    cmp w8, w9
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 4)
-  %r = icmp eq i32 %cr, 0
-  ret i1 %r
-}
-
-; or (xor a, b), (and (xor c, d), C2)
-define i1 @bcmp5(ptr %a, ptr %b) {
-; CHECK-LABEL: bcmp5:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr w8, [x0]
-; CHECK-NEXT:    ldr w9, [x1]
-; CHECK-NEXT:    ldrb w10, [x0, #4]
-; CHECK-NEXT:    ldrb w11, [x1, #4]
-; CHECK-NEXT:    cmp w8, w9
-; CHECK-NEXT:    ccmp w10, w11, #0, eq
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 5)
-  %r = icmp eq i32 %cr, 0
-  ret i1 %r
-}
-
-; or (xor a, b), (and (xor c, d), C2)
-define i1 @bcmp6(ptr %a, ptr %b) {
-; CHECK-LABEL: bcmp6:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr w8, [x0]
-; CHECK-NEXT:    ldr w9, [x1]
-; CHECK-NEXT:    ldrh w10, [x0, #4]
-; CHECK-NEXT:    ldrh w11, [x1, #4]
-; CHECK-NEXT:    cmp w8, w9
-; CHECK-NEXT:    ccmp w10, w11, #0, eq
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 6)
-  %r = icmp eq i32 %cr, 0
-  ret i1 %r
-}
-
-; or (xor a, b), (xor c, d)
-define i1 @bcmp7(ptr %a, ptr %b) {
-; CHECK-LABEL: bcmp7:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr w8, [x0]
-; CHECK-NEXT:    ldr w9, [x1]
-; CHECK-NEXT:    ldur w10, [x0, #3]
-; CHECK-NEXT:    ldur w11, [x1, #3]
-; CHECK-NEXT:    cmp w8, w9
-; CHECK-NEXT:    ccmp w10, w11, #0, eq
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 7)
-  %r = icmp eq i32 %cr, 0
-  ret i1 %r
-}
-
-define i1 @bcmp8(ptr %a, ptr %b) {
-; CHECK-LABEL: bcmp8:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 8)
-  %r = icmp eq i32 %cr, 0
-  ret i1 %r
-}
-
-; or (xor a, b), (and (xor c, d), C2)
-define i1 @bcmp9(ptr %a, ptr %b) {
-; CHECK-LABEL: bcmp9:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    ldrb w10, [x0, #8]
-; CHECK-NEXT:    ldrb w11, [x1, #8]
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    ccmp x10, x11, #0, eq
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 9)
-  %r = icmp eq i32 %cr, 0
-  ret i1 %r
-}
-
-define i1 @bcmp10(ptr %a, ptr %b) {
-; CHECK-LABEL: bcmp10:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    ldrh w10, [x0, #8]
-; CHECK-NEXT:    ldrh w11, [x1, #8]
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    ccmp x10, x11, #0, eq
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 10)
-  %r = icmp eq i32 %cr, 0
-  ret i1 %r
-}
-
-define i1 @bcmp11(ptr %a, ptr %b) {
-; CHECK-LABEL: bcmp11:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    ldur x10, [x0, #3]
-; CHECK-NEXT:    ldur x11, [x1, #3]
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    ccmp x10, x11, #0, eq
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 11)
-  %r = icmp eq i32 %cr, 0
-  ret i1 %r
-}
-
-define i1 @bcmp12(ptr %a, ptr %b) {
-; CHECK-LABEL: bcmp12:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    ldr w10, [x0, #8]
-; CHECK-NEXT:    ldr w11, [x1, #8]
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    ccmp x10, x11, #0, eq
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 12)
-  %r = icmp eq i32 %cr, 0
-  ret i1 %r
-}
-
-define i1 @bcmp13(ptr %a, ptr %b) {
-; CHECK-LABEL: bcmp13:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    ldur x10, [x0, #5]
-; CHECK-NEXT:    ldur x11, [x1, #5]
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    ccmp x10, x11, #0, eq
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 13)
-  %r = icmp eq i32 %cr, 0
-  ret i1 %r
-}
-
-define i1 @bcmp14(ptr %a, ptr %b) {
-; CHECK-LABEL: bcmp14:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    ldur x10, [x0, #6]
-; CHECK-NEXT:    ldur x11, [x1, #6]
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    ccmp x10, x11, #0, eq
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 14)
-  %r = icmp eq i32 %cr, 0
-  ret i1 %r
-}
-
-define i1 @bcmp15(ptr %a, ptr %b) {
-; CHECK-LABEL: bcmp15:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    ldur x10, [x0, #7]
-; CHECK-NEXT:    ldur x11, [x1, #7]
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    ccmp x10, x11, #0, eq
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 15)
-  %r = icmp eq i32 %cr, 0
-  ret i1 %r
-}
-
-define i1 @bcmp16(ptr %a, ptr %b) {
-; CHECK-LABEL: bcmp16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp x8, x11, [x1]
-; CHECK-NEXT:    ldp x9, x10, [x0]
-; CHECK-NEXT:    cmp x9, x8
-; CHECK-NEXT:    ccmp x10, x11, #0, eq
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 16)
-  %r = icmp eq i32 %cr, 0
-  ret i1 %r
-}
-
-define i1 @bcmp20(ptr %a, ptr %b) {
-; CHECK-LABEL: bcmp20:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp x8, x11, [x1]
-; CHECK-NEXT:    ldr w12, [x0, #16]
-; CHECK-NEXT:    ldp x9, x10, [x0]
-; CHECK-NEXT:    ldr w13, [x1, #16]
-; CHECK-NEXT:    cmp x9, x8
-; CHECK-NEXT:    ccmp x10, x11, #0, eq
-; CHECK-NEXT:    ccmp x12, x13, #0, eq
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 20)
-  %r = icmp eq i32 %cr, 0
-  ret i1 %r
-}
-
-define i1 @bcmp24(ptr %a, ptr %b) {
-; CHECK-LABEL: bcmp24:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp x8, x11, [x1]
-; CHECK-NEXT:    ldr x12, [x0, #16]
-; CHECK-NEXT:    ldp x9, x10, [x0]
-; CHECK-NEXT:    ldr x13, [x1, #16]
-; CHECK-NEXT:    cmp x9, x8
-; CHECK-NEXT:    ccmp x10, x11, #0, eq
-; CHECK-NEXT:    ccmp x12, x13, #0, eq
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 24)
-  %r = icmp eq i32 %cr, 0
-  ret i1 %r
-}
-
-define i1 @bcmp28(ptr %a, ptr %b) {
-; CHECK-LABEL: bcmp28:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp x8, x11, [x1]
-; CHECK-NEXT:    ldr x12, [x0, #16]
-; CHECK-NEXT:    ldp x9, x10, [x0]
-; CHECK-NEXT:    ldr x13, [x1, #16]
-; CHECK-NEXT:    cmp x9, x8
-; CHECK-NEXT:    ldr w8, [x0, #24]
-; CHECK-NEXT:    ldr w9, [x1, #24]
-; CHECK-NEXT:    ccmp x10, x11, #0, eq
-; CHECK-NEXT:    ccmp x12, x13, #0, eq
-; CHECK-NEXT:    ccmp x8, x9, #0, eq
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 28)
-  %r = icmp eq i32 %cr, 0
-  ret i1 %r
-}
-
-define i1 @bcmp33(ptr %a, ptr %b) {
-; CHECK-LABEL: bcmp33:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp x8, x11, [x1]
-; CHECK-NEXT:    ldp x9, x10, [x0]
-; CHECK-NEXT:    ldp x12, x13, [x1, #16]
-; CHECK-NEXT:    cmp x9, x8
-; CHECK-NEXT:    ldp x8, x9, [x0, #16]
-; CHECK-NEXT:    ccmp x10, x11, #0, eq
-; CHECK-NEXT:    ldrb w10, [x0, #32]
-; CHECK-NEXT:    ldrb w11, [x1, #32]
-; CHECK-NEXT:    ccmp x8, x12, #0, eq
-; CHECK-NEXT:    ccmp x9, x13, #0, eq
-; CHECK-NEXT:    ccmp x10, x11, #0, eq
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 33)
-  %r = icmp eq i32 %cr, 0
-  ret i1 %r
-}
-
-define i1 @bcmp38(ptr %a, ptr %b) {
-; CHECK-LABEL: bcmp38:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp x8, x11, [x1]
-; CHECK-NEXT:    ldp x9, x10, [x0]
-; CHECK-NEXT:    ldp x12, x13, [x1, #16]
-; CHECK-NEXT:    cmp x9, x8
-; CHECK-NEXT:    ldp x8, x9, [x0, #16]
-; CHECK-NEXT:    ccmp x10, x11, #0, eq
-; CHECK-NEXT:    ldur x10, [x0, #30]
-; CHECK-NEXT:    ldur x11, [x1, #30]
-; CHECK-NEXT:    ccmp x8, x12, #0, eq
-; CHECK-NEXT:    ccmp x9, x13, #0, eq
-; CHECK-NEXT:    ccmp x10, x11, #0, eq
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 38)
-  %r = icmp eq i32 %cr, 0
-  ret i1 %r
-}
-
-define i1 @bcmp45(ptr %a, ptr %b) {
-; CHECK-LABEL: bcmp45:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp x8, x11, [x1]
-; CHECK-NEXT:    ldp x9, x10, [x0]
-; CHECK-NEXT:    ldp x12, x13, [x1, #16]
-; CHECK-NEXT:    cmp x9, x8
-; CHECK-NEXT:    ldp x8, x9, [x0, #16]
-; CHECK-NEXT:    ccmp x10, x11, #0, eq
-; CHECK-NEXT:    ldr x10, [x0, #32]
-; CHECK-NEXT:    ldr x11, [x1, #32]
-; CHECK-NEXT:    ccmp x8, x12, #0, eq
-; CHECK-NEXT:    ldur x8, [x0, #37]
-; CHECK-NEXT:    ldur x12, [x1, #37]
-; CHECK-NEXT:    ccmp x9, x13, #0, eq
-; CHECK-NEXT:    ccmp x10, x11, #0, eq
-; CHECK-NEXT:    ccmp x8, x12, #0, eq
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 45)
-  %r = icmp eq i32 %cr, 0
-  ret i1 %r
-}
-
-; Although the large cmp chain may be not profitable on high end CPU, we
-; believe it is better on most cpus, so perform the transform now.
-; 8 xor + 7 or + 1 cmp only need 6 cycles on a 4 width ALU port machine
-;   2 cycle for xor
-;   3 cycle for or
-;   1 cycle for cmp
-define i1 @bcmp64(ptr %a, ptr %b) {
-; CHECK-LABEL: bcmp64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp x8, x11, [x1]
-; CHECK-NEXT:    ldp x9, x10, [x0]
-; CHECK-NEXT:    ldp x12, x13, [x1, #16]
-; CHECK-NEXT:    cmp x9, x8
-; CHECK-NEXT:    ldp x8, x9, [x0, #16]
-; CHECK-NEXT:    ccmp x10, x11, #0, eq
-; CHECK-NEXT:    ccmp x8, x12, #0, eq
-; CHECK-NEXT:    ldp x8, x11, [x0, #32]
-; CHECK-NEXT:    ldp x10, x12, [x1, #32]
-; CHECK-NEXT:    ccmp x9, x13, #0, eq
-; CHECK-NEXT:    ldp x9, x13, [x1, #48]
-; CHECK-NEXT:    ccmp x8, x10, #0, eq
-; CHECK-NEXT:    ldp x8, x10, [x0, #48]
-; CHECK-NEXT:    ccmp x11, x12, #0, eq
-; CHECK-NEXT:    ccmp x8, x9, #0, eq
-; CHECK-NEXT:    ccmp x10, x13, #0, eq
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 64)
-  %r = icmp eq i32 %cr, 0
-  ret i1 %r
-}
-
-define i1 @bcmp89(ptr %a, ptr %b) {
-; CHECK-LABEL: bcmp89:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    .cfi_offset w30, -16
-; CHECK-NEXT:    mov w2, #89 // =0x59
-; CHECK-NEXT:    bl bcmp
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 89)
-  %r = icmp eq i32 %cr, 0
-  ret i1 %r
-}
-
-define i1 @bcmp_zext(i32 %0, i32 %1, i8 %2, i8 %3) {
-; CHECK-LABEL: bcmp_zext:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w2, #0xff
-; CHECK-NEXT:    and w9, w3, #0xff
-; CHECK-NEXT:    cmp w1, w0
-; CHECK-NEXT:    ccmp w9, w8, #0, eq
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %5 = xor i32 %1, %0
-  %6 = xor i8 %3, %2
-  %7 = zext i8 %6 to i32
-  %8 = or i32 %5, %7
-  %9 = icmp eq i32 %8, 0
-  ret i1 %9
-}
-
-define i1 @bcmp_i8(i8 %a0, i8 %b0, i8 %a1, i8 %b1, i8 %a2, i8 %b2) {
-; CHECK-LABEL: bcmp_i8:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w1, #0xff
-; CHECK-NEXT:    and w9, w2, #0xff
-; CHECK-NEXT:    and w10, w3, #0xff
-; CHECK-NEXT:    cmp w8, w0, uxtb
-; CHECK-NEXT:    and w8, w4, #0xff
-; CHECK-NEXT:    and w11, w5, #0xff
-; CHECK-NEXT:    ccmp w10, w9, #0, eq
-; CHECK-NEXT:    ccmp w11, w8, #0, eq
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %xor0 = xor i8 %b0, %a0
-  %xor1 = xor i8 %b1, %a1
-  %xor2 = xor i8 %b2, %a2
-  %or0 = or i8 %xor0, %xor1
-  %or1 = or i8 %or0, %xor2
-  %r = icmp eq i8 %or1, 0
-  ret i1 %r
-}
-
-define i1 @bcmp_i16(i16 %a0, i16 %b0, i16 %a1, i16 %b1, i16 %a2, i16 %b2) {
-; CHECK-LABEL: bcmp_i16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    and w8, w1, #0xffff
-; CHECK-NEXT:    and w9, w2, #0xffff
-; CHECK-NEXT:    and w10, w3, #0xffff
-; CHECK-NEXT:    cmp w8, w0, uxth
-; CHECK-NEXT:    and w8, w4, #0xffff
-; CHECK-NEXT:    and w11, w5, #0xffff
-; CHECK-NEXT:    ccmp w10, w9, #0, eq
-; CHECK-NEXT:    ccmp w11, w8, #0, eq
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %xor0 = xor i16 %b0, %a0
-  %xor1 = xor i16 %b1, %a1
-  %xor2 = xor i16 %b2, %a2
-  %or0 = or i16 %xor0, %xor1
-  %or1 = or i16 %or0, %xor2
-  %r = icmp eq i16 %or1, 0
-  ret i1 %r
-}
-
-define i1 @bcmp_i128(i128 %a0, i128 %b0, i128 %a1, i128 %b1, i128 %a2, i128 %b2) {
-; CHECK-LABEL: bcmp_i128:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    cmp x2, x0
-; CHECK-NEXT:    ldp x8, x10, [sp]
-; CHECK-NEXT:    ccmp x3, x1, #0, eq
-; CHECK-NEXT:    ldp x9, x11, [sp, #16]
-; CHECK-NEXT:    ccmp x6, x4, #0, eq
-; CHECK-NEXT:    ccmp x7, x5, #0, eq
-; CHECK-NEXT:    cset w12, ne
-; CHECK-NEXT:    cmp x9, x8
-; CHECK-NEXT:    ccmp x11, x10, #0, eq
-; CHECK-NEXT:    csinc w0, w12, wzr, eq
-; CHECK-NEXT:    ret
-  %xor0 = xor i128 %b0, %a0
-  %xor1 = xor i128 %b1, %a1
-  %xor2 = xor i128 %b2, %a2
-  %or0 = or i128 %xor0, %xor1
-  %or1 = or i128 %or0, %xor2
-  %r = icmp ne i128 %or1, 0
-  ret i1 %r
-}
-
-define i1 @bcmp_i42(i42 %a0, i42 %b0, i42 %a1, i42 %b1, i42 %a2, i42 %b2) {
-; CHECK-LABEL: bcmp_i42:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    and x8, x0, #0x3ffffffffff
-; CHECK-NEXT:    and x9, x1, #0x3ffffffffff
-; CHECK-NEXT:    and x10, x2, #0x3ffffffffff
-; CHECK-NEXT:    and x11, x3, #0x3ffffffffff
-; CHECK-NEXT:    cmp x9, x8
-; CHECK-NEXT:    and x8, x4, #0x3ffffffffff
-; CHECK-NEXT:    and x9, x5, #0x3ffffffffff
-; CHECK-NEXT:    ccmp x11, x10, #0, eq
-; CHECK-NEXT:    ccmp x9, x8, #0, eq
-; CHECK-NEXT:    cset w0, ne
-; CHECK-NEXT:    ret
-  %xor0 = xor i42 %b0, %a0
-  %xor1 = xor i42 %b1, %a1
-  %xor2 = xor i42 %b2, %a2
-  %or0 = or i42 %xor0, %xor1
-  %or1 = or i42 %or0, %xor2
-  %r = icmp ne i42 %or1, 0
-  ret i1 %r
-}
diff --git a/llvm/test/CodeGen/AArch64/dag-combine-setcc.ll b/llvm/test/CodeGen/AArch64/dag-combine-setcc.ll
index a48a4e0e723ebc..855a5b23f6c1cc 100644
--- a/llvm/test/CodeGen/AArch64/dag-combine-setcc.ll
+++ b/llvm/test/CodeGen/AArch64/dag-combine-setcc.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
 ; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
 
 define i1 @combine_setcc_eq_vecreduce_or_v8i1(<8 x i8> %a) {
@@ -266,8 +266,18 @@ define i1 @combine_setcc_eq0_conjunction_xor_or(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ccmp x10, x11, #0, eq
 ; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
-  %bcmp = tail call i32 @bcmp(ptr dereferenceable(16) %a, ptr dereferenceable(16) %b, i64 16)
-  %cmp = icmp eq i32 %bcmp, 0
+  %1 = load i64, ptr %a, align 1
+  %2 = load i64, ptr %b, align 1
+  %3 = xor i64 %1, %2
+  %4 = getelementptr i8, ptr %a, i64 8
+  %5 = getelementptr i8, ptr %b, i64 8
+  %6 = load i64, ptr %4, align 1
+  %7 = load i64, ptr %5, align 1
+  %8 = xor i64 %6, %7
+  %9 = or i64 %3, %8
+  %10 = icmp ne i64 %9, 0
+  %11 = zext i1 %10 to i32
+  %cmp = icmp eq i32 %11, 0
   ret i1 %cmp
 }
 
@@ -280,9 +290,18 @@ define i1 @combine_setcc_ne0_conjunction_xor_or(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ccmp x10, x11, #0, eq
 ; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret
-  %bcmp = tail call i32 @bcmp(ptr dereferenceable(16) %a, ptr dereferenceable(16) %b, i64 16)
-  %cmp = icmp ne i32 %bcmp, 0
-  ret i1 %cmp
+  %1 = load i64, ptr %a, align 1
+  %2 = load i64, ptr %b, align 1
+  %3 = xor i64 %1, %2
+  %4 = getelementptr i8, ptr %a, i64 8
+  %5 = getelementptr i8, ptr %b, i64 8
+  %6 = load i64, ptr %4, align 1
+  %7 = load i64, ptr %5, align 1
+  %8 = xor i64 %6, %7
+  %9 = or i64 %3, %8
+  %10 = icmp ne i64 %9, 0
+  %11 = zext i1 %10 to i32
+  ret i1 %10
 }
 
 ; Doesn't increase the number of instructions, where the LHS has multiple uses
diff --git a/llvm/test/CodeGen/AArch64/machine-licm-hoist-load.ll b/llvm/test/CodeGen/AArch64/machine-licm-hoist-load.ll
index 30123a31cebbe9..fc0bc1b9661163 100644
--- a/llvm/test/CodeGen/AArch64/machine-licm-hoist-load.ll
+++ b/llvm/test/CodeGen/AArch64/machine-licm-hoist-load.ll
@@ -25,20 +25,23 @@ define i64 @one_dimensional(ptr %a, ptr %b, i64 %N) {
 entry:
   br label %for.body
 
-for.body:                                         ; preds = %entry, %for.body
+for.body:                                         ; preds = %for.body, %entry
   %i.06 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
   %sum.05 = phi i64 [ %spec.select, %for.body ], [ 0, %entry ]
   %arrayidx = getelementptr inbounds ptr, ptr %a, i64 %i.06
   %0 = load ptr, ptr %arrayidx, align 8
-  %bcmp = tail call i32 @bcmp(ptr %0, ptr %b, i64 4)
-  %tobool = icmp eq i32 %bcmp, 0
+  %1 = load i32, ptr %0, align 1
+  %2 = load i32, ptr %b, align 1
+  %3 = icmp ne i32 %1, %2
+  %4 = zext i1 %3 to i32
+  %tobool = icmp eq i32 %4, 0
   %add = zext i1 %tobool to i64
   %spec.select = add i64 %sum.05, %add
   %inc = add nuw i64 %i.06, 1
   %exitcond = icmp eq i64 %inc, %N
   br i1 %exitcond, label %for.exit, label %for.body
 
-for.exit:                                 ; preds = %for.body
+for.exit:                                         ; preds = %for.body
   ret i64 %spec.select
 }
 
@@ -79,32 +82,35 @@ define i64 @two_dimensional(ptr %a, ptr %b, i64 %N, i64 %M) {
 entry:
   br label %for.cond1.preheader
 
-for.cond1.preheader:                           ; preds = %entry, %for.cond1.for.exit3_crit_edge
+for.cond1.preheader:                              ; preds = %for.cond1.for.exit3_crit_edge, %entry
   %i.019 = phi i64 [ %inc7, %for.cond1.for.exit3_crit_edge ], [ 0, %entry ]
   %sum.018 = phi i64 [ %spec.select, %for.cond1.for.exit3_crit_edge ], [ 0, %entry ]
   %arrayidx = getelementptr inbounds ptr, ptr %a, i64 %i.019
   %0 = load ptr, ptr %arrayidx, align 8
   br label %for.body4
 
-for.body4:                                     ; preds = %for.cond1.preheader, %for.body4
+for.body4:                                        ; preds = %for.body4, %for.cond1.preheader
   %j.016 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body4 ]
   %sum.115 = phi i64 [ %sum.018, %for.cond1.preheader ], [ %spec.select, %for.body4 ]
   %arrayidx5 = getelementptr inbounds ptr, ptr %0, i64 %j.016
   %1 = load ptr, ptr %arrayidx5, align 8
-  %bcmp = tail call i32 @bcmp(ptr %1, ptr %b, i64 4)
-  %tobool = icmp eq i32 %bcmp, 0
+  %2 = load i32, ptr %1, align 1
+  %3 = load i32, ptr %b, align 1
+  %4 = icmp ne i32 %2, %3
+  %5 = zext i1 %4 to i32
+  %tobool = icmp eq i32 %5, 0
   %add = zext i1 %tobool to i64
   %spec.select = add i64 %sum.115, %add
   %inc = add nuw i64 %j.016, 1
   %exitcond = icmp eq i64 %inc, %M
   br i1 %exitcond, label %for.cond1.for.exit3_crit_edge, label %for.body4
 
-for.cond1.for.exit3_crit_edge:         ; preds = %for.body4
+for.cond1.for.exit3_crit_edge:                    ; preds = %for.body4
   %inc7 = add nuw i64 %i.019, 1
   %exitcond22 = icmp eq i64 %inc7, %N
   br i1 %exitcond22, label %for.exit, label %for.cond1.preheader
 
-for.exit:                                 ; preds = %for.cond1.for.exit3_crit_edge
+for.exit:                                         ; preds = %for.cond1.for.exit3_crit_edge
   ret i64 %spec.select
 }
 
@@ -159,44 +165,47 @@ define i64 @three_dimensional(ptr %a, ptr %b, i64 %N, i64 %M, i64 %K) {
 entry:
   br label %for.cond1.preheader
 
-for.cond1.preheader:                        ; preds = %entry, %for.cond1.for.cond
+for.cond1.preheader:                              ; preds = %for.cond1.for.cond, %entry
   %i.033 = phi i64 [ %inc15, %for.cond1.for.cond ], [ 0, %entry ]
   %sum.032 = phi i64 [ %spec.select, %for.cond1.for.cond ], [ 0, %entry ]
   %arrayidx = getelementptr inbounds ptr, ptr %a, i64 %i.033
   %0 = load ptr, ptr %arrayidx, align 8
   br label %for.cond5.preheader
 
-for.cond5.preheader:                     ; preds = %for.cond5.for.cond, %for.cond1.preheader
+for.cond5.preheader:                              ; preds = %for.cond5.for.cond, %for.cond1.preheader
   %j.029 = phi i64 [ 0, %for.cond1.preheader ], [ %inc12, %for.cond5.for.cond ]
   %sum.128 = phi i64 [ %sum.032, %for.cond1.preheader ], [ %spec.select, %for.cond5.for.cond ]
   %arrayidx9 = getelementptr inbounds ptr, ptr %0, i64 %j.029
   %1 = load ptr, ptr %arrayidx9, align 8
   br label %for.body8
 
-for.body8:                               ; preds = %for.body8, %for.cond5.preheader
+for.body8:                                        ; preds = %for.body8, %for.cond5.preheader
   %k.026 = phi i64 [ 0, %for.cond5.preheader ], [ %inc, %for.body8 ]
   %sum.225 = phi i64 [ %sum.128, %for.cond5.preheader ], [ %spec.select, %for.body8 ]
   %arrayidx10 = getelementptr inbounds ptr, ptr %1, i64 %k.026
   %2 = load ptr, ptr %arrayidx10, align 8
-  %bcmp = tail call i32 @bcmp(ptr %2, ptr %b, i64 4)
-  %tobool = icmp eq i32 %bcmp, 0
+  %3 = load i32, ptr %2, align 1
+  %4 = load i32, ptr %b, align 1
+  %5 = icmp ne i32 %3, %4
+  %6 = zext i1 %5 to i32
+  %tobool = icmp eq i32 %6, 0
   %add = zext i1 %tobool to i64
   %spec.select = add i64 %sum.225, %add
   %inc = add nuw i64 %k.026, 1
   %exitcond = icmp eq i64 %inc, %K
   br i1 %exitcond, label %for.cond5.for.cond, label %for.body8
 
-for.cond5.for.cond:   ; preds = %for.body8
+for.cond5.for.cond:                               ; preds = %for.body8
   %inc12 = add nuw i64 %j.029, 1
   %exitcond44 = icmp eq i64 %inc12, %M
   br i1 %exitcond44, label %for.cond1.for.cond, label %for.cond5.preheader
 
-for.cond1.for.cond: ; preds = %for.cond5.for.cond
+for.cond1.for.cond:                               ; preds = %for.cond5.for.cond
   %inc15 = add nuw i64 %i.033, 1
   %exitcond45 = icmp eq i64 %inc15, %N
   br i1 %exitcond45, label %for.exit, label %for.cond1.preheader
 
-for.exit:                                 ; preds = %for.cond1.for.cond
+for.exit:                                         ; preds = %for.cond1.for.cond
   ret i64 %spec.select
 }
 
@@ -254,14 +263,14 @@ define i64 @three_dimensional_middle(ptr %a, ptr %b, i64 %N, i64 %M, i64 %K) {
 entry:
   br label %for.cond1.preheader
 
-for.cond1.preheader:                        ; preds = %entry, %for.cond1.for.cond
+for.cond1.preheader:                              ; preds = %for.cond1.for.cond, %entry
   %i.035 = phi i64 [ %inc16, %for.cond1.for.cond ], [ 0, %entry ]
   %sum.034 = phi i64 [ %spec.select, %for.cond1.for.cond ], [ 0, %entry ]
   %arrayidx = getelementptr inbounds ptr, ptr %a, i64 %i.035
   %0 = load ptr, ptr %arrayidx, align 8
   br label %for.cond5.preheader
 
-for.cond5.preheader:                     ; preds = %for.cond5.for.cond, %for.cond1.preheader
+for.cond5.preheader:                              ; preds = %for.cond5.for.cond, %for.cond1.preheader
   %j.031 = phi i64 [ 0, %for.cond1.preheader ], [ %inc13, %for.cond5.for.cond ]
   %sum.130 = phi i64 [ %sum.034, %for.cond1.preheader ], [ %spec.select, %for.cond5.for.cond ]
   %arrayidx9 = getelementptr inbounds ptr, ptr %0, i64 %j.031
@@ -270,30 +279,33 @@ for.cond5.preheader:                     ; preds = %for.cond5.for.cond, %for.con
   %2 = load ptr, ptr %arrayidx11, align 8
   br label %for.body8
 
-for.body8:                               ; preds = %for.body8, %for.cond5.preheader
+for.body8:                                        ; preds = %for.body8, %for.cond5.preheader
   %k.028 = phi i64 [ 0, %for.cond5.preheader ], [ %inc, %for.body8 ]
   %sum.227 = phi i64 [ %sum.130, %for.cond5.preheader ], [ %spec.select, %for.body8 ]
   %arrayidx10 = getelementptr inbounds ptr, ptr %1, i64 %k.028
   %3 = load ptr, ptr %arrayidx10, align 8
-  %bcmp = tail call i32 @bcmp(ptr %3, ptr %2, i64 4)
-  %tobool = icmp eq i32 %bcmp, 0
+  %4 = load i32, ptr %3, align 1
+  %5 = load i32, ptr %2, align 1
+  %6 = icmp ne i32 %4, %5
+  %7 = zext i1 %6 to i32
+  %tobool = icmp eq i32 %7, 0
   %add = zext i1 %tobool to i64
   %spec.select = add i64 %sum.227, %add
   %inc = add nuw i64 %k.028, 1
   %exitcond = icmp eq i64 %inc, %K
   br i1 %exitcond, label %for.cond5.for.cond, label %for.body8
 
-for.cond5.for.cond:   ; preds = %for.body8
+for.cond5.for.cond:                               ; preds = %for.body8
   %inc13 = add nuw i64 %j.031, 1
   %exitcond46 = icmp eq i64 %inc13, %M
   br i1 %exitcond46, label %for.cond1.for.cond, label %for.cond5.preheader
 
-for.cond1.for.cond: ; preds = %for.cond5.for.cond
+for.cond1.for.cond:                               ; preds = %for.cond5.for.cond
   %inc16 = add nuw i64 %i.035, 1
   %exitcond47 = icmp eq i64 %inc16, %N
   br i1 %exitcond47, label %for.exit, label %for.cond1.preheader
 
-for.exit:                                 ; preds = %for.cond1.for.cond
+for.exit:                                         ; preds = %for.cond1.for.cond
   ret i64 %spec.select
 }
 
@@ -328,19 +340,27 @@ for.body.preheader:                               ; preds = %entry
   %wide.trip.count = zext i32 %N to i64
   br label %for.body
 
-for.body:                                         ; preds = %for.body.preheader, %for.body
+for.body:                                         ; preds = %for.body, %for.body.preheader
   %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
   %arrayidx = getelementptr inbounds ptr, ptr %a, i64 %indvars.iv
   %0 = load ptr, ptr %arrayidx, align 8
-  %call = tail call i32 @memcmp(ptr %0, ptr %b, i64 4)
-  %conv = trunc i32 %call to i8
+  %1 = load i32, ptr %0, align 1
+  %2 = load i32, ptr %b, align 1
+  %3 = call i32 @llvm.bswap.i32(i32 %1)
+  %4 = call i32 @llvm.bswap.i32(i32 %2)
+  %5 = icmp ugt i32 %3, %4
+  %6 = icmp ult i32 %3, %4
+  %7 = zext i1 %5 to i32
+  %8 = zext i1 %6 to i32
+  %9 = sub i32 %7, %8
+  %conv = trunc i32 %9 to i8
   %arrayidx2 = getelementptr inbounds i8, ptr %c, i64 %indvars.iv
   store i8 %conv, ptr %arrayidx2, align 1
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
   %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
   br i1 %exitcond.not, label %for.exit, label %for.body
 
-for.exit:                                 ; preds = %for.body
+for.exit:                                         ; preds = %for.body
   ret void
 }
 
@@ -385,13 +405,16 @@ for.body.preheader:                               ; preds = %entry
   %wide.trip.count = zext i32 %N to i64
   br label %for.body
 
-for.body:                                         ; preds = %for.body.preheader, %for.body
+for.body:                                         ; preds = %for.body, %for.body.preheader
   %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
   %sum.05 = phi i32 [ 0, %for.body.preheader ], [ %spec.select, %for.body ]
   %arrayidx = getelementptr inbounds ptr, ptr %a, i64 %indvars.iv
   %0 = load ptr, ptr %arrayidx, align 8
-  %bcmp = tail call i32 @bcmp(ptr %0, ptr %b, i64 4)
-  %tobool.not = icmp eq i32 %bcmp, 0
+  %1 = load i32, ptr %0, align 1
+  %2 = load i32, ptr %b, align 1
+  %3 = icmp ne i32 %1, %2
+  %4 = zext i1 %3 to i32
+  %tobool.not = icmp eq i32 %4, 0
   %add = zext i1 %tobool.not to i32
   %spec.select = add nuw nsw i32 %sum.05, %add
   tail call void @func()
@@ -399,7 +422,7 @@ for.body:                                         ; preds = %for.body.preheader,
   %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
   br i1 %exitcond.not, label %for.exit, label %for.body
 
-for.exit:                                 ; preds = %for.body
+for.exit:                                         ; preds = %for.body
   ret i32 %spec.select
 }
 
@@ -431,20 +454,32 @@ define i64 @one_dimensional_two_loads(ptr %a, ptr %b, i64 %N) {
 entry:
   br label %for.body
 
-for.body:                                         ; preds = %entry, %for.body
+for.body:                                         ; preds = %for.body, %entry
   %i.06 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
   %sum.05 = phi i64 [ %spec.select, %for.body ], [ 0, %entry ]
   %arrayidx = getelementptr inbounds ptr, ptr %a, i64 %i.06
   %0 = load ptr, ptr %arrayidx, align 8
-  %bcmp = tail call i32 @bcmp(ptr %0, ptr %b, i64 6)
-  %tobool = icmp eq i32 %bcmp, 0
+  %1 = load i32, ptr %0, align 1
+  %2 = load i32, ptr %b, align 1
+  %3 = xor i32 %1, %2
+  %4 = getelementptr i8, ptr %0, i64 4
+  %5 = getelementptr i8, ptr %b, i64 4
+  %6 = load i16, ptr %4, align 1
+  %7 = load i16, ptr %5, align 1
+  %8 = zext i16 %6 to i32
+  %9 = zext i16 %7 to i32
+  %10 = xor i32 %8, %9
+  %11 = or i32 %3, %10
+  %12 = icmp ne i32 %11, 0
+  %13 = zext i1 %12 to i32
+  %tobool = icmp eq i32 %13, 0
   %add = zext i1 %tobool to i64
   %spec.select = add i64 %sum.05, %add
   %inc = add nuw i64 %i.06, 1
   %exitcond = icmp eq i64 %inc, %N
   br i1 %exitcond, label %for.exit, label %for.body
 
-for.exit:                                 ; preds = %for.body
+for.exit:                                         ; preds = %for.body
   ret i64 %spec.select
 }
 
@@ -475,18 +510,18 @@ define i64 @hoisting_no_cse(ptr %a, ptr %b, ptr %c, i64 %N) {
 ; CHECK-NEXT:    mov x0, x8
 ; CHECK-NEXT:    ret
 entry:
-  %b.val = load i64, ptr %b
+  %b.val = load i64, ptr %b, align 8
   %b.val.changed = add i64 %b.val, 1
-  store i64 %b.val.changed, ptr %c
+  store i64 %b.val.changed, ptr %c, align 8
   br label %for.body
 
-for.body:                                         ; preds = %entry, %for.body
+for.body:                                         ; preds = %for.body, %entry
   %idx = phi i64 [ %inc, %for.body ], [ 0, %entry ]
   %sum = phi i64 [ %spec.select, %for.body ], [ 0, %entry ]
   %arrayidx = getelementptr inbounds ptr, ptr %a, i64 %idx
   %0 = load ptr, ptr %arrayidx, align 8
-  %x = load i64, ptr %0
-  %y = load i64, ptr %b
+  %x = load i64, ptr %0, align 8
+  %y = load i64, ptr %b, align 8
   %cmp = icmp eq i64 %x, %y
   %add = zext i1 %cmp to i64
   %spec.select = add i64 %sum, %add
@@ -494,10 +529,15 @@ for.body:                                         ; preds = %entry, %for.body
   %exitcond = icmp eq i64 %inc, %N
   br i1 %exitcond, label %for.exit, label %for.body
 
-for.exit:                                 ; preds = %for.body
+for.exit:                                         ; preds = %for.body
   ret i64 %spec.select
 }
 
 declare i32 @bcmp(ptr, ptr, i64)
 declare i32 @memcmp(ptr, ptr, i64)
 declare void @func()
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare i32 @llvm.bswap.i32(i32) #0
+
+attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
diff --git a/llvm/test/CodeGen/AArch64/memcmp.ll b/llvm/test/CodeGen/AArch64/memcmp.ll
deleted file mode 100644
index 4da7c8c95a4e4f..00000000000000
--- a/llvm/test/CodeGen/AArch64/memcmp.ll
+++ /dev/null
@@ -1,3029 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc < %s -mtriple=aarch64-unknown-unknown | FileCheck %s
-
- at .str = private constant [513 x i8] c"01234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901\00", align 1
-
-declare dso_local i32 @memcmp(ptr, ptr, i64)
-
-define i32 @length0(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length0:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w0, wzr
-; CHECK-NEXT:    ret
-   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 0) nounwind
-   ret i32 %m
- }
-
-define i1 @length0_eq(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length0_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w0, #1 // =0x1
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 0) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length0_lt(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length0_lt:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w0, wzr
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 0) nounwind
-  %c = icmp slt i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length2(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length2:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldrh w8, [x0]
-; CHECK-NEXT:    ldrh w9, [x1]
-; CHECK-NEXT:    rev w8, w8
-; CHECK-NEXT:    rev w9, w9
-; CHECK-NEXT:    lsr w8, w8, #16
-; CHECK-NEXT:    sub w0, w8, w9, lsr #16
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
-  ret i32 %m
-}
-
-define i32 @length2_const(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length2_const:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldrh w9, [x0]
-; CHECK-NEXT:    mov w8, #-12594 // =0xffffcece
-; CHECK-NEXT:    rev w9, w9
-; CHECK-NEXT:    add w0, w8, w9, lsr #16
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i64 2) nounwind
-  ret i32 %m
-}
-
-define i1 @length2_gt_const(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length2_gt_const:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldrh w9, [x0]
-; CHECK-NEXT:    mov w8, #-12594 // =0xffffcece
-; CHECK-NEXT:    rev w9, w9
-; CHECK-NEXT:    add w8, w8, w9, lsr #16
-; CHECK-NEXT:    cmp w8, #0
-; CHECK-NEXT:    cset w0, gt
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i64 2) nounwind
-  %c = icmp sgt i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length2_eq(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length2_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldrh w8, [x0]
-; CHECK-NEXT:    ldrh w9, [x1]
-; CHECK-NEXT:    cmp w8, w9
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length2_lt(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length2_lt:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldrh w8, [x0]
-; CHECK-NEXT:    ldrh w9, [x1]
-; CHECK-NEXT:    rev w8, w8
-; CHECK-NEXT:    rev w9, w9
-; CHECK-NEXT:    lsr w8, w8, #16
-; CHECK-NEXT:    sub w8, w8, w9, lsr #16
-; CHECK-NEXT:    lsr w0, w8, #31
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
-  %c = icmp slt i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length2_gt(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length2_gt:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldrh w8, [x0]
-; CHECK-NEXT:    ldrh w9, [x1]
-; CHECK-NEXT:    rev w8, w8
-; CHECK-NEXT:    rev w9, w9
-; CHECK-NEXT:    lsr w8, w8, #16
-; CHECK-NEXT:    sub w8, w8, w9, lsr #16
-; CHECK-NEXT:    cmp w8, #0
-; CHECK-NEXT:    cset w0, gt
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
-  %c = icmp sgt i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length2_eq_const(ptr %X) nounwind {
-; CHECK-LABEL: length2_eq_const:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldrh w8, [x0]
-; CHECK-NEXT:    mov w9, #12849 // =0x3231
-; CHECK-NEXT:    cmp w8, w9
-; CHECK-NEXT:    cset w0, ne
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i64 2) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length2_eq_nobuiltin_attr(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length2_eq_nobuiltin_attr:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    mov w2, #2 // =0x2
-; CHECK-NEXT:    bl memcmp
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind nobuiltin
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length3(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length3:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldrb w8, [x0, #2]
-; CHECK-NEXT:    ldrh w9, [x0]
-; CHECK-NEXT:    ldrb w10, [x1, #2]
-; CHECK-NEXT:    ldrh w11, [x1]
-; CHECK-NEXT:    orr w8, w9, w8, lsl #16
-; CHECK-NEXT:    orr w9, w11, w10, lsl #16
-; CHECK-NEXT:    rev w8, w8
-; CHECK-NEXT:    rev w9, w9
-; CHECK-NEXT:    cmp w8, w9
-; CHECK-NEXT:    cset w8, hi
-; CHECK-NEXT:    cset w9, lo
-; CHECK-NEXT:    sub w0, w8, w9
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 3) nounwind
-  ret i32 %m
-}
-
-define i1 @length3_eq(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length3_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldrh w8, [x0]
-; CHECK-NEXT:    ldrh w9, [x1]
-; CHECK-NEXT:    ldrb w10, [x0, #2]
-; CHECK-NEXT:    ldrb w11, [x1, #2]
-; CHECK-NEXT:    cmp w8, w9
-; CHECK-NEXT:    ccmp w10, w11, #0, eq
-; CHECK-NEXT:    cset w0, ne
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 3) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length4(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length4:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr w8, [x0]
-; CHECK-NEXT:    ldr w9, [x1]
-; CHECK-NEXT:    rev w8, w8
-; CHECK-NEXT:    rev w9, w9
-; CHECK-NEXT:    cmp w8, w9
-; CHECK-NEXT:    cset w8, hi
-; CHECK-NEXT:    cset w9, lo
-; CHECK-NEXT:    sub w0, w8, w9
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
-  ret i32 %m
-}
-
-define i1 @length4_eq(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length4_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr w8, [x0]
-; CHECK-NEXT:    ldr w9, [x1]
-; CHECK-NEXT:    cmp w8, w9
-; CHECK-NEXT:    cset w0, ne
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length4_lt(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length4_lt:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr w8, [x0]
-; CHECK-NEXT:    ldr w9, [x1]
-; CHECK-NEXT:    rev w8, w8
-; CHECK-NEXT:    rev w9, w9
-; CHECK-NEXT:    cmp w8, w9
-; CHECK-NEXT:    cset w0, lo
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
-  %c = icmp slt i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length4_lt_32(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length4_lt_32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr w8, [x0]
-; CHECK-NEXT:    ldr w9, [x1]
-; CHECK-NEXT:    rev w8, w8
-; CHECK-NEXT:    rev w9, w9
-; CHECK-NEXT:    cmp w8, w9
-; CHECK-NEXT:    cset w0, lo
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
-  %c = lshr i32 %m, 31
-  ret i32 %c
-}
-
-define i1 @length4_gt(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length4_gt:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr w8, [x0]
-; CHECK-NEXT:    ldr w9, [x1]
-; CHECK-NEXT:    rev w8, w8
-; CHECK-NEXT:    rev w9, w9
-; CHECK-NEXT:    cmp w8, w9
-; CHECK-NEXT:    cset w0, hi
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
-  %c = icmp sgt i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length4_eq_const(ptr %X) nounwind {
-; CHECK-LABEL: length4_eq_const:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr w8, [x0]
-; CHECK-NEXT:    mov w9, #12849 // =0x3231
-; CHECK-NEXT:    movk w9, #13363, lsl #16
-; CHECK-NEXT:    cmp w8, w9
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i64 4) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length5(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length5:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldrb w8, [x0, #4]
-; CHECK-NEXT:    ldr w9, [x0]
-; CHECK-NEXT:    ldrb w10, [x1, #4]
-; CHECK-NEXT:    ldr w11, [x1]
-; CHECK-NEXT:    orr x8, x9, x8, lsl #32
-; CHECK-NEXT:    orr x9, x11, x10, lsl #32
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    cset w8, hi
-; CHECK-NEXT:    cset w9, lo
-; CHECK-NEXT:    sub w0, w8, w9
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) nounwind
-  ret i32 %m
-}
-
-define i1 @length5_eq(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length5_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr w8, [x0]
-; CHECK-NEXT:    ldr w9, [x1]
-; CHECK-NEXT:    ldrb w10, [x0, #4]
-; CHECK-NEXT:    ldrb w11, [x1, #4]
-; CHECK-NEXT:    cmp w8, w9
-; CHECK-NEXT:    ccmp w10, w11, #0, eq
-; CHECK-NEXT:    cset w0, ne
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length5_lt(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length5_lt:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldrb w8, [x0, #4]
-; CHECK-NEXT:    ldr w9, [x0]
-; CHECK-NEXT:    ldrb w10, [x1, #4]
-; CHECK-NEXT:    ldr w11, [x1]
-; CHECK-NEXT:    orr x8, x9, x8, lsl #32
-; CHECK-NEXT:    orr x9, x11, x10, lsl #32
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    cset w0, lo
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) nounwind
-  %c = icmp slt i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length6(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length6:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldrh w8, [x0, #4]
-; CHECK-NEXT:    ldr w9, [x0]
-; CHECK-NEXT:    ldrh w10, [x1, #4]
-; CHECK-NEXT:    ldr w11, [x1]
-; CHECK-NEXT:    orr x8, x9, x8, lsl #32
-; CHECK-NEXT:    orr x9, x11, x10, lsl #32
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    cset w8, hi
-; CHECK-NEXT:    cset w9, lo
-; CHECK-NEXT:    sub w0, w8, w9
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 6) nounwind
-  ret i32 %m
-}
-
-define i32 @length6_lt(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length6_lt:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldrh w8, [x0, #4]
-; CHECK-NEXT:    ldr w9, [x0]
-; CHECK-NEXT:    ldrh w10, [x1, #4]
-; CHECK-NEXT:    ldr w11, [x1]
-; CHECK-NEXT:    orr x8, x9, x8, lsl #32
-; CHECK-NEXT:    orr x9, x11, x10, lsl #32
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    cset w0, lo
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 6) nounwind
-  %r = lshr i32 %m, 31
-  ret i32 %r
-}
-
-define i32 @length7(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length7:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr w8, [x0]
-; CHECK-NEXT:    ldr w9, [x1]
-; CHECK-NEXT:    rev w8, w8
-; CHECK-NEXT:    rev w9, w9
-; CHECK-NEXT:    cmp w8, w9
-; CHECK-NEXT:    b.ne .LBB24_3
-; CHECK-NEXT:  // %bb.1: // %loadbb1
-; CHECK-NEXT:    ldur w8, [x0, #3]
-; CHECK-NEXT:    ldur w9, [x1, #3]
-; CHECK-NEXT:    rev w8, w8
-; CHECK-NEXT:    rev w9, w9
-; CHECK-NEXT:    cmp w8, w9
-; CHECK-NEXT:    b.ne .LBB24_3
-; CHECK-NEXT:  // %bb.2:
-; CHECK-NEXT:    mov w0, wzr
-; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB24_3: // %res_block
-; CHECK-NEXT:    cmp w8, w9
-; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
-; CHECK-NEXT:    cneg w0, w8, hs
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 7) nounwind
-  ret i32 %m
-}
-
-define i1 @length7_lt(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length7_lt:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr w8, [x0]
-; CHECK-NEXT:    ldr w9, [x1]
-; CHECK-NEXT:    rev w8, w8
-; CHECK-NEXT:    rev w9, w9
-; CHECK-NEXT:    cmp w8, w9
-; CHECK-NEXT:    b.ne .LBB25_3
-; CHECK-NEXT:  // %bb.1: // %loadbb1
-; CHECK-NEXT:    ldur w8, [x0, #3]
-; CHECK-NEXT:    ldur w9, [x1, #3]
-; CHECK-NEXT:    rev w8, w8
-; CHECK-NEXT:    rev w9, w9
-; CHECK-NEXT:    cmp w8, w9
-; CHECK-NEXT:    b.ne .LBB25_3
-; CHECK-NEXT:  // %bb.2:
-; CHECK-NEXT:    lsr w0, wzr, #31
-; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB25_3: // %res_block
-; CHECK-NEXT:    cmp w8, w9
-; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
-; CHECK-NEXT:    cneg w8, w8, hs
-; CHECK-NEXT:    lsr w0, w8, #31
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 7) nounwind
-  %c = icmp slt i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length7_eq(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length7_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr w8, [x0]
-; CHECK-NEXT:    ldr w9, [x1]
-; CHECK-NEXT:    ldur w10, [x0, #3]
-; CHECK-NEXT:    ldur w11, [x1, #3]
-; CHECK-NEXT:    cmp w8, w9
-; CHECK-NEXT:    ccmp w10, w11, #0, eq
-; CHECK-NEXT:    cset w0, ne
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 7) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length8(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length8:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    cset w8, hi
-; CHECK-NEXT:    cset w9, lo
-; CHECK-NEXT:    sub w0, w8, w9
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 8) nounwind
-  ret i32 %m
-}
-
-define i1 @length8_eq(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length8_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 8) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length8_eq_const(ptr %X) nounwind {
-; CHECK-LABEL: length8_eq_const:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x9, #12592 // =0x3130
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    movk x9, #13106, lsl #16
-; CHECK-NEXT:    movk x9, #13620, lsl #32
-; CHECK-NEXT:    movk x9, #14134, lsl #48
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    cset w0, ne
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 8) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length9(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length9:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB30_2
-; CHECK-NEXT:  // %bb.1: // %loadbb1
-; CHECK-NEXT:    ldrb w8, [x0, #8]
-; CHECK-NEXT:    ldrb w9, [x1, #8]
-; CHECK-NEXT:    sub w0, w8, w9
-; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB30_2: // %res_block
-; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
-; CHECK-NEXT:    cneg w0, w8, hs
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 9) nounwind
-  ret i32 %m
-}
-
-define i1 @length9_eq(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length9_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    ldrb w10, [x0, #8]
-; CHECK-NEXT:    ldrb w11, [x1, #8]
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    ccmp x10, x11, #0, eq
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 9) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length10(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length10:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB32_3
-; CHECK-NEXT:  // %bb.1: // %loadbb1
-; CHECK-NEXT:    ldrh w8, [x0, #8]
-; CHECK-NEXT:    ldrh w9, [x1, #8]
-; CHECK-NEXT:    rev w8, w8
-; CHECK-NEXT:    rev w9, w9
-; CHECK-NEXT:    lsr w8, w8, #16
-; CHECK-NEXT:    lsr w9, w9, #16
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB32_3
-; CHECK-NEXT:  // %bb.2:
-; CHECK-NEXT:    mov w0, wzr
-; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB32_3: // %res_block
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
-; CHECK-NEXT:    cneg w0, w8, hs
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 10) nounwind
-  ret i32 %m
-}
-
-define i1 @length10_eq(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length10_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    ldrh w10, [x0, #8]
-; CHECK-NEXT:    ldrh w11, [x1, #8]
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    ccmp x10, x11, #0, eq
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 10) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length11(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length11:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB34_3
-; CHECK-NEXT:  // %bb.1: // %loadbb1
-; CHECK-NEXT:    ldur x8, [x0, #3]
-; CHECK-NEXT:    ldur x9, [x1, #3]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB34_3
-; CHECK-NEXT:  // %bb.2:
-; CHECK-NEXT:    mov w0, wzr
-; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB34_3: // %res_block
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
-; CHECK-NEXT:    cneg w0, w8, hs
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 11) nounwind
-  ret i32 %m
-}
-
-define i1 @length11_eq(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length11_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    ldur x10, [x0, #3]
-; CHECK-NEXT:    ldur x11, [x1, #3]
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    ccmp x10, x11, #0, eq
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 11) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length12_eq(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length12_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    ldr w10, [x0, #8]
-; CHECK-NEXT:    ldr w11, [x1, #8]
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    ccmp x10, x11, #0, eq
-; CHECK-NEXT:    cset w0, ne
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 12) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length12(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length12:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB37_3
-; CHECK-NEXT:  // %bb.1: // %loadbb1
-; CHECK-NEXT:    ldr w8, [x0, #8]
-; CHECK-NEXT:    ldr w9, [x1, #8]
-; CHECK-NEXT:    rev w8, w8
-; CHECK-NEXT:    rev w9, w9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB37_3
-; CHECK-NEXT:  // %bb.2:
-; CHECK-NEXT:    mov w0, wzr
-; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB37_3: // %res_block
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
-; CHECK-NEXT:    cneg w0, w8, hs
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 12) nounwind
-  ret i32 %m
-}
-
-define i1 @length13_eq(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length13_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    ldur x10, [x0, #5]
-; CHECK-NEXT:    ldur x11, [x1, #5]
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    ccmp x10, x11, #0, eq
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 13) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length14_eq(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length14_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    ldur x10, [x0, #6]
-; CHECK-NEXT:    ldur x11, [x1, #6]
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    ccmp x10, x11, #0, eq
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 14) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length15(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length15:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB40_3
-; CHECK-NEXT:  // %bb.1: // %loadbb1
-; CHECK-NEXT:    ldur x8, [x0, #7]
-; CHECK-NEXT:    ldur x9, [x1, #7]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB40_3
-; CHECK-NEXT:  // %bb.2:
-; CHECK-NEXT:    mov w0, wzr
-; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB40_3: // %res_block
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
-; CHECK-NEXT:    cneg w0, w8, hs
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 15) nounwind
-  ret i32 %m
-}
-
-define i1 @length15_lt(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length15_lt:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB41_3
-; CHECK-NEXT:  // %bb.1: // %loadbb1
-; CHECK-NEXT:    ldur x8, [x0, #7]
-; CHECK-NEXT:    ldur x9, [x1, #7]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB41_3
-; CHECK-NEXT:  // %bb.2:
-; CHECK-NEXT:    lsr w0, wzr, #31
-; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB41_3: // %res_block
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
-; CHECK-NEXT:    cneg w8, w8, hs
-; CHECK-NEXT:    lsr w0, w8, #31
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 15) nounwind
-  %c = icmp slt i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length15_const(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length15_const:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #14136 // =0x3738
-; CHECK-NEXT:    ldr x9, [x0]
-; CHECK-NEXT:    movk x8, #13622, lsl #16
-; CHECK-NEXT:    movk x8, #13108, lsl #32
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    movk x8, #12594, lsl #48
-; CHECK-NEXT:    cmp x9, x8
-; CHECK-NEXT:    b.ne .LBB42_3
-; CHECK-NEXT:  // %bb.1: // %loadbb1
-; CHECK-NEXT:    mov x8, #13365 // =0x3435
-; CHECK-NEXT:    ldur x9, [x0, #7]
-; CHECK-NEXT:    movk x8, #12851, lsl #16
-; CHECK-NEXT:    movk x8, #12337, lsl #32
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    movk x8, #14393, lsl #48
-; CHECK-NEXT:    cmp x9, x8
-; CHECK-NEXT:    b.ne .LBB42_3
-; CHECK-NEXT:  // %bb.2:
-; CHECK-NEXT:    mov w0, wzr
-; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB42_3: // %res_block
-; CHECK-NEXT:    cmp x9, x8
-; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
-; CHECK-NEXT:    cneg w0, w8, hs
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i64 15) nounwind
-  ret i32 %m
-}
-
-define i1 @length15_eq(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length15_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    ldur x10, [x0, #7]
-; CHECK-NEXT:    ldur x11, [x1, #7]
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    ccmp x10, x11, #0, eq
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 15) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length15_gt_const(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length15_gt_const:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #14136 // =0x3738
-; CHECK-NEXT:    ldr x9, [x0]
-; CHECK-NEXT:    movk x8, #13622, lsl #16
-; CHECK-NEXT:    movk x8, #13108, lsl #32
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    movk x8, #12594, lsl #48
-; CHECK-NEXT:    cmp x9, x8
-; CHECK-NEXT:    b.ne .LBB44_3
-; CHECK-NEXT:  // %bb.1: // %loadbb1
-; CHECK-NEXT:    mov x8, #13365 // =0x3435
-; CHECK-NEXT:    ldur x9, [x0, #7]
-; CHECK-NEXT:    movk x8, #12851, lsl #16
-; CHECK-NEXT:    movk x8, #12337, lsl #32
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    movk x8, #14393, lsl #48
-; CHECK-NEXT:    cmp x9, x8
-; CHECK-NEXT:    b.ne .LBB44_3
-; CHECK-NEXT:  // %bb.2:
-; CHECK-NEXT:    mov w8, wzr
-; CHECK-NEXT:    b .LBB44_4
-; CHECK-NEXT:  .LBB44_3: // %res_block
-; CHECK-NEXT:    cmp x9, x8
-; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
-; CHECK-NEXT:    cneg w8, w8, hs
-; CHECK-NEXT:  .LBB44_4: // %endblock
-; CHECK-NEXT:    cmp w8, #0
-; CHECK-NEXT:    cset w0, gt
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i64 15) nounwind
-  %c = icmp sgt i32 %m, 0
-  ret i1 %c
-}
-
-
-define i32 @length16(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length16:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB45_3
-; CHECK-NEXT:  // %bb.1: // %loadbb1
-; CHECK-NEXT:    ldr x8, [x0, #8]
-; CHECK-NEXT:    ldr x9, [x1, #8]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB45_3
-; CHECK-NEXT:  // %bb.2:
-; CHECK-NEXT:    mov w0, wzr
-; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB45_3: // %res_block
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
-; CHECK-NEXT:    cneg w0, w8, hs
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 16) nounwind
-  ret i32 %m
-}
-
-define i1 @length16_eq(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length16_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp x8, x11, [x1]
-; CHECK-NEXT:    ldp x9, x10, [x0]
-; CHECK-NEXT:    cmp x9, x8
-; CHECK-NEXT:    ccmp x10, x11, #0, eq
-; CHECK-NEXT:    cset w0, ne
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 16) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length16_lt(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length16_lt:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB47_3
-; CHECK-NEXT:  // %bb.1: // %loadbb1
-; CHECK-NEXT:    ldr x8, [x0, #8]
-; CHECK-NEXT:    ldr x9, [x1, #8]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB47_3
-; CHECK-NEXT:  // %bb.2:
-; CHECK-NEXT:    lsr w0, wzr, #31
-; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB47_3: // %res_block
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
-; CHECK-NEXT:    cneg w8, w8, hs
-; CHECK-NEXT:    lsr w0, w8, #31
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 16) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length16_gt(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length16_gt:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB48_3
-; CHECK-NEXT:  // %bb.1: // %loadbb1
-; CHECK-NEXT:    ldr x8, [x0, #8]
-; CHECK-NEXT:    ldr x9, [x1, #8]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB48_3
-; CHECK-NEXT:  // %bb.2:
-; CHECK-NEXT:    mov w8, wzr
-; CHECK-NEXT:    b .LBB48_4
-; CHECK-NEXT:  .LBB48_3: // %res_block
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
-; CHECK-NEXT:    cneg w8, w8, hs
-; CHECK-NEXT:  .LBB48_4: // %endblock
-; CHECK-NEXT:    cmp w8, #0
-; CHECK-NEXT:    cset w0, gt
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 16) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length16_eq_const(ptr %X) nounwind {
-; CHECK-LABEL: length16_eq_const:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #12592 // =0x3130
-; CHECK-NEXT:    ldp x9, x10, [x0]
-; CHECK-NEXT:    movk x8, #13106, lsl #16
-; CHECK-NEXT:    movk x8, #13620, lsl #32
-; CHECK-NEXT:    movk x8, #14134, lsl #48
-; CHECK-NEXT:    cmp x9, x8
-; CHECK-NEXT:    mov x8, #14648 // =0x3938
-; CHECK-NEXT:    movk x8, #12592, lsl #16
-; CHECK-NEXT:    movk x8, #13106, lsl #32
-; CHECK-NEXT:    movk x8, #13620, lsl #48
-; CHECK-NEXT:    ccmp x10, x8, #0, eq
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 16) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-
-define i32 @length24(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length24:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB50_4
-; CHECK-NEXT:  // %bb.1: // %loadbb1
-; CHECK-NEXT:    ldr x8, [x0, #8]
-; CHECK-NEXT:    ldr x9, [x1, #8]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB50_4
-; CHECK-NEXT:  // %bb.2: // %loadbb2
-; CHECK-NEXT:    ldr x8, [x0, #16]
-; CHECK-NEXT:    ldr x9, [x1, #16]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB50_4
-; CHECK-NEXT:  // %bb.3:
-; CHECK-NEXT:    mov w0, wzr
-; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB50_4: // %res_block
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
-; CHECK-NEXT:    cneg w0, w8, hs
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 24) nounwind
-  ret i32 %m
-}
-
-define i1 @length24_eq(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length24_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp x8, x11, [x1]
-; CHECK-NEXT:    ldr x12, [x0, #16]
-; CHECK-NEXT:    ldp x9, x10, [x0]
-; CHECK-NEXT:    ldr x13, [x1, #16]
-; CHECK-NEXT:    cmp x9, x8
-; CHECK-NEXT:    ccmp x10, x11, #0, eq
-; CHECK-NEXT:    ccmp x12, x13, #0, eq
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 24) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length24_lt(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length24_lt:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB52_4
-; CHECK-NEXT:  // %bb.1: // %loadbb1
-; CHECK-NEXT:    ldr x8, [x0, #8]
-; CHECK-NEXT:    ldr x9, [x1, #8]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB52_4
-; CHECK-NEXT:  // %bb.2: // %loadbb2
-; CHECK-NEXT:    ldr x8, [x0, #16]
-; CHECK-NEXT:    ldr x9, [x1, #16]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB52_4
-; CHECK-NEXT:  // %bb.3:
-; CHECK-NEXT:    lsr w0, wzr, #31
-; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB52_4: // %res_block
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
-; CHECK-NEXT:    cneg w8, w8, hs
-; CHECK-NEXT:    lsr w0, w8, #31
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 24) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length24_gt(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length24_gt:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB53_4
-; CHECK-NEXT:  // %bb.1: // %loadbb1
-; CHECK-NEXT:    ldr x8, [x0, #8]
-; CHECK-NEXT:    ldr x9, [x1, #8]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB53_4
-; CHECK-NEXT:  // %bb.2: // %loadbb2
-; CHECK-NEXT:    ldr x8, [x0, #16]
-; CHECK-NEXT:    ldr x9, [x1, #16]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB53_4
-; CHECK-NEXT:  // %bb.3:
-; CHECK-NEXT:    mov w8, wzr
-; CHECK-NEXT:    b .LBB53_5
-; CHECK-NEXT:  .LBB53_4: // %res_block
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
-; CHECK-NEXT:    cneg w8, w8, hs
-; CHECK-NEXT:  .LBB53_5: // %endblock
-; CHECK-NEXT:    cmp w8, #0
-; CHECK-NEXT:    cset w0, gt
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 24) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length24_eq_const(ptr %X) nounwind {
-; CHECK-LABEL: length24_eq_const:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #12592 // =0x3130
-; CHECK-NEXT:    ldp x9, x10, [x0]
-; CHECK-NEXT:    movk x8, #13106, lsl #16
-; CHECK-NEXT:    ldr x11, [x0, #16]
-; CHECK-NEXT:    movk x8, #13620, lsl #32
-; CHECK-NEXT:    movk x8, #14134, lsl #48
-; CHECK-NEXT:    cmp x9, x8
-; CHECK-NEXT:    mov x8, #14648 // =0x3938
-; CHECK-NEXT:    movk x8, #12592, lsl #16
-; CHECK-NEXT:    movk x8, #13106, lsl #32
-; CHECK-NEXT:    movk x8, #13620, lsl #48
-; CHECK-NEXT:    ccmp x10, x8, #0, eq
-; CHECK-NEXT:    mov x8, #14134 // =0x3736
-; CHECK-NEXT:    movk x8, #14648, lsl #16
-; CHECK-NEXT:    movk x8, #12592, lsl #32
-; CHECK-NEXT:    movk x8, #13106, lsl #48
-; CHECK-NEXT:    ccmp x11, x8, #0, eq
-; CHECK-NEXT:    cset w0, ne
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 24) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length31(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length31:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB55_5
-; CHECK-NEXT:  // %bb.1: // %loadbb1
-; CHECK-NEXT:    ldr x8, [x0, #8]
-; CHECK-NEXT:    ldr x9, [x1, #8]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB55_5
-; CHECK-NEXT:  // %bb.2: // %loadbb2
-; CHECK-NEXT:    ldr x8, [x0, #16]
-; CHECK-NEXT:    ldr x9, [x1, #16]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB55_5
-; CHECK-NEXT:  // %bb.3: // %loadbb3
-; CHECK-NEXT:    ldur x8, [x0, #23]
-; CHECK-NEXT:    ldur x9, [x1, #23]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB55_5
-; CHECK-NEXT:  // %bb.4:
-; CHECK-NEXT:    mov w0, wzr
-; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB55_5: // %res_block
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
-; CHECK-NEXT:    cneg w0, w8, hs
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 31) nounwind
-  ret i32 %m
-}
-
-define i1 @length31_eq(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length31_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp x8, x11, [x1]
-; CHECK-NEXT:    ldr x12, [x0, #16]
-; CHECK-NEXT:    ldp x9, x10, [x0]
-; CHECK-NEXT:    ldr x13, [x1, #16]
-; CHECK-NEXT:    cmp x9, x8
-; CHECK-NEXT:    ldur x8, [x0, #23]
-; CHECK-NEXT:    ldur x9, [x1, #23]
-; CHECK-NEXT:    ccmp x10, x11, #0, eq
-; CHECK-NEXT:    ccmp x12, x13, #0, eq
-; CHECK-NEXT:    ccmp x8, x9, #0, eq
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 31) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length31_lt(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length31_lt:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB57_5
-; CHECK-NEXT:  // %bb.1: // %loadbb1
-; CHECK-NEXT:    ldr x8, [x0, #8]
-; CHECK-NEXT:    ldr x9, [x1, #8]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB57_5
-; CHECK-NEXT:  // %bb.2: // %loadbb2
-; CHECK-NEXT:    ldr x8, [x0, #16]
-; CHECK-NEXT:    ldr x9, [x1, #16]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB57_5
-; CHECK-NEXT:  // %bb.3: // %loadbb3
-; CHECK-NEXT:    ldur x8, [x0, #23]
-; CHECK-NEXT:    ldur x9, [x1, #23]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB57_5
-; CHECK-NEXT:  // %bb.4:
-; CHECK-NEXT:    lsr w0, wzr, #31
-; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB57_5: // %res_block
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
-; CHECK-NEXT:    cneg w8, w8, hs
-; CHECK-NEXT:    lsr w0, w8, #31
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 31) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length31_gt(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length31_gt:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB58_5
-; CHECK-NEXT:  // %bb.1: // %loadbb1
-; CHECK-NEXT:    ldr x8, [x0, #8]
-; CHECK-NEXT:    ldr x9, [x1, #8]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB58_5
-; CHECK-NEXT:  // %bb.2: // %loadbb2
-; CHECK-NEXT:    ldr x8, [x0, #16]
-; CHECK-NEXT:    ldr x9, [x1, #16]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB58_5
-; CHECK-NEXT:  // %bb.3: // %loadbb3
-; CHECK-NEXT:    ldur x8, [x0, #23]
-; CHECK-NEXT:    ldur x9, [x1, #23]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB58_5
-; CHECK-NEXT:  // %bb.4:
-; CHECK-NEXT:    mov w8, wzr
-; CHECK-NEXT:    b .LBB58_6
-; CHECK-NEXT:  .LBB58_5: // %res_block
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
-; CHECK-NEXT:    cneg w8, w8, hs
-; CHECK-NEXT:  .LBB58_6: // %endblock
-; CHECK-NEXT:    cmp w8, #0
-; CHECK-NEXT:    cset w0, gt
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 31) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length31_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"="128" {
-; CHECK-LABEL: length31_eq_prefer128:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp x8, x11, [x1]
-; CHECK-NEXT:    ldr x12, [x0, #16]
-; CHECK-NEXT:    ldp x9, x10, [x0]
-; CHECK-NEXT:    ldr x13, [x1, #16]
-; CHECK-NEXT:    cmp x9, x8
-; CHECK-NEXT:    ldur x8, [x0, #23]
-; CHECK-NEXT:    ldur x9, [x1, #23]
-; CHECK-NEXT:    ccmp x10, x11, #0, eq
-; CHECK-NEXT:    ccmp x12, x13, #0, eq
-; CHECK-NEXT:    ccmp x8, x9, #0, eq
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 31) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length31_eq_const(ptr %X) nounwind {
-; CHECK-LABEL: length31_eq_const:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #12592 // =0x3130
-; CHECK-NEXT:    ldp x9, x10, [x0]
-; CHECK-NEXT:    movk x8, #13106, lsl #16
-; CHECK-NEXT:    ldr x11, [x0, #16]
-; CHECK-NEXT:    movk x8, #13620, lsl #32
-; CHECK-NEXT:    movk x8, #14134, lsl #48
-; CHECK-NEXT:    cmp x9, x8
-; CHECK-NEXT:    mov x8, #14648 // =0x3938
-; CHECK-NEXT:    ldur x9, [x0, #23]
-; CHECK-NEXT:    movk x8, #12592, lsl #16
-; CHECK-NEXT:    movk x8, #13106, lsl #32
-; CHECK-NEXT:    movk x8, #13620, lsl #48
-; CHECK-NEXT:    ccmp x10, x8, #0, eq
-; CHECK-NEXT:    mov x8, #14134 // =0x3736
-; CHECK-NEXT:    movk x8, #14648, lsl #16
-; CHECK-NEXT:    movk x8, #12592, lsl #32
-; CHECK-NEXT:    movk x8, #13106, lsl #48
-; CHECK-NEXT:    ccmp x11, x8, #0, eq
-; CHECK-NEXT:    mov x8, #13363 // =0x3433
-; CHECK-NEXT:    movk x8, #13877, lsl #16
-; CHECK-NEXT:    movk x8, #14391, lsl #32
-; CHECK-NEXT:    movk x8, #12345, lsl #48
-; CHECK-NEXT:    ccmp x9, x8, #0, eq
-; CHECK-NEXT:    cset w0, ne
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 31) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length32(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB61_5
-; CHECK-NEXT:  // %bb.1: // %loadbb1
-; CHECK-NEXT:    ldr x8, [x0, #8]
-; CHECK-NEXT:    ldr x9, [x1, #8]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB61_5
-; CHECK-NEXT:  // %bb.2: // %loadbb2
-; CHECK-NEXT:    ldr x8, [x0, #16]
-; CHECK-NEXT:    ldr x9, [x1, #16]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB61_5
-; CHECK-NEXT:  // %bb.3: // %loadbb3
-; CHECK-NEXT:    ldr x8, [x0, #24]
-; CHECK-NEXT:    ldr x9, [x1, #24]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB61_5
-; CHECK-NEXT:  // %bb.4:
-; CHECK-NEXT:    mov w0, wzr
-; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB61_5: // %res_block
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
-; CHECK-NEXT:    cneg w0, w8, hs
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 32) nounwind
-  ret i32 %m
-}
-
-
-define i1 @length32_eq(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length32_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp x8, x11, [x1]
-; CHECK-NEXT:    ldp x9, x10, [x0]
-; CHECK-NEXT:    ldp x12, x13, [x1, #16]
-; CHECK-NEXT:    cmp x9, x8
-; CHECK-NEXT:    ldp x8, x9, [x0, #16]
-; CHECK-NEXT:    ccmp x10, x11, #0, eq
-; CHECK-NEXT:    ccmp x8, x12, #0, eq
-; CHECK-NEXT:    ccmp x9, x13, #0, eq
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 32) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length32_lt(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length32_lt:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB63_5
-; CHECK-NEXT:  // %bb.1: // %loadbb1
-; CHECK-NEXT:    ldr x8, [x0, #8]
-; CHECK-NEXT:    ldr x9, [x1, #8]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB63_5
-; CHECK-NEXT:  // %bb.2: // %loadbb2
-; CHECK-NEXT:    ldr x8, [x0, #16]
-; CHECK-NEXT:    ldr x9, [x1, #16]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB63_5
-; CHECK-NEXT:  // %bb.3: // %loadbb3
-; CHECK-NEXT:    ldr x8, [x0, #24]
-; CHECK-NEXT:    ldr x9, [x1, #24]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB63_5
-; CHECK-NEXT:  // %bb.4:
-; CHECK-NEXT:    lsr w0, wzr, #31
-; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB63_5: // %res_block
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
-; CHECK-NEXT:    cneg w8, w8, hs
-; CHECK-NEXT:    lsr w0, w8, #31
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 32) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length32_gt(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length32_gt:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB64_5
-; CHECK-NEXT:  // %bb.1: // %loadbb1
-; CHECK-NEXT:    ldr x8, [x0, #8]
-; CHECK-NEXT:    ldr x9, [x1, #8]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB64_5
-; CHECK-NEXT:  // %bb.2: // %loadbb2
-; CHECK-NEXT:    ldr x8, [x0, #16]
-; CHECK-NEXT:    ldr x9, [x1, #16]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB64_5
-; CHECK-NEXT:  // %bb.3: // %loadbb3
-; CHECK-NEXT:    ldr x8, [x0, #24]
-; CHECK-NEXT:    ldr x9, [x1, #24]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB64_5
-; CHECK-NEXT:  // %bb.4:
-; CHECK-NEXT:    mov w8, wzr
-; CHECK-NEXT:    b .LBB64_6
-; CHECK-NEXT:  .LBB64_5: // %res_block
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
-; CHECK-NEXT:    cneg w8, w8, hs
-; CHECK-NEXT:  .LBB64_6: // %endblock
-; CHECK-NEXT:    cmp w8, #0
-; CHECK-NEXT:    cset w0, gt
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 32) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length32_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"="128" {
-; CHECK-LABEL: length32_eq_prefer128:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp x8, x11, [x1]
-; CHECK-NEXT:    ldp x9, x10, [x0]
-; CHECK-NEXT:    ldp x12, x13, [x1, #16]
-; CHECK-NEXT:    cmp x9, x8
-; CHECK-NEXT:    ldp x8, x9, [x0, #16]
-; CHECK-NEXT:    ccmp x10, x11, #0, eq
-; CHECK-NEXT:    ccmp x8, x12, #0, eq
-; CHECK-NEXT:    ccmp x9, x13, #0, eq
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 32) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length32_eq_const(ptr %X) nounwind {
-; CHECK-LABEL: length32_eq_const:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #12592 // =0x3130
-; CHECK-NEXT:    ldp x9, x10, [x0]
-; CHECK-NEXT:    movk x8, #13106, lsl #16
-; CHECK-NEXT:    movk x8, #13620, lsl #32
-; CHECK-NEXT:    movk x8, #14134, lsl #48
-; CHECK-NEXT:    cmp x9, x8
-; CHECK-NEXT:    mov x8, #14648 // =0x3938
-; CHECK-NEXT:    movk x8, #12592, lsl #16
-; CHECK-NEXT:    ldp x9, x11, [x0, #16]
-; CHECK-NEXT:    movk x8, #13106, lsl #32
-; CHECK-NEXT:    movk x8, #13620, lsl #48
-; CHECK-NEXT:    ccmp x10, x8, #0, eq
-; CHECK-NEXT:    mov x8, #14134 // =0x3736
-; CHECK-NEXT:    movk x8, #14648, lsl #16
-; CHECK-NEXT:    movk x8, #12592, lsl #32
-; CHECK-NEXT:    movk x8, #13106, lsl #48
-; CHECK-NEXT:    ccmp x9, x8, #0, eq
-; CHECK-NEXT:    mov x8, #13620 // =0x3534
-; CHECK-NEXT:    movk x8, #14134, lsl #16
-; CHECK-NEXT:    movk x8, #14648, lsl #32
-; CHECK-NEXT:    movk x8, #12592, lsl #48
-; CHECK-NEXT:    ccmp x11, x8, #0, eq
-; CHECK-NEXT:    cset w0, ne
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 32) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length48(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length48:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB67_7
-; CHECK-NEXT:  // %bb.1: // %loadbb1
-; CHECK-NEXT:    ldr x8, [x0, #8]
-; CHECK-NEXT:    ldr x9, [x1, #8]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB67_7
-; CHECK-NEXT:  // %bb.2: // %loadbb2
-; CHECK-NEXT:    ldr x8, [x0, #16]
-; CHECK-NEXT:    ldr x9, [x1, #16]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB67_7
-; CHECK-NEXT:  // %bb.3: // %loadbb3
-; CHECK-NEXT:    ldr x8, [x0, #24]
-; CHECK-NEXT:    ldr x9, [x1, #24]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB67_7
-; CHECK-NEXT:  // %bb.4: // %loadbb4
-; CHECK-NEXT:    ldr x8, [x0, #32]
-; CHECK-NEXT:    ldr x9, [x1, #32]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB67_7
-; CHECK-NEXT:  // %bb.5: // %loadbb5
-; CHECK-NEXT:    ldr x8, [x0, #40]
-; CHECK-NEXT:    ldr x9, [x1, #40]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB67_7
-; CHECK-NEXT:  // %bb.6:
-; CHECK-NEXT:    mov w0, wzr
-; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB67_7: // %res_block
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
-; CHECK-NEXT:    cneg w0, w8, hs
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 48) nounwind
-  ret i32 %m
-}
-
-define i1 @length48_eq(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length48_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp x8, x11, [x1]
-; CHECK-NEXT:    ldp x9, x10, [x0]
-; CHECK-NEXT:    ldp x12, x13, [x1, #16]
-; CHECK-NEXT:    cmp x9, x8
-; CHECK-NEXT:    ldp x8, x9, [x0, #16]
-; CHECK-NEXT:    ccmp x10, x11, #0, eq
-; CHECK-NEXT:    ccmp x8, x12, #0, eq
-; CHECK-NEXT:    ldp x8, x11, [x0, #32]
-; CHECK-NEXT:    ldp x10, x12, [x1, #32]
-; CHECK-NEXT:    ccmp x9, x13, #0, eq
-; CHECK-NEXT:    ccmp x8, x10, #0, eq
-; CHECK-NEXT:    ccmp x11, x12, #0, eq
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 48) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length48_lt(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length48_lt:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB69_7
-; CHECK-NEXT:  // %bb.1: // %loadbb1
-; CHECK-NEXT:    ldr x8, [x0, #8]
-; CHECK-NEXT:    ldr x9, [x1, #8]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB69_7
-; CHECK-NEXT:  // %bb.2: // %loadbb2
-; CHECK-NEXT:    ldr x8, [x0, #16]
-; CHECK-NEXT:    ldr x9, [x1, #16]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB69_7
-; CHECK-NEXT:  // %bb.3: // %loadbb3
-; CHECK-NEXT:    ldr x8, [x0, #24]
-; CHECK-NEXT:    ldr x9, [x1, #24]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB69_7
-; CHECK-NEXT:  // %bb.4: // %loadbb4
-; CHECK-NEXT:    ldr x8, [x0, #32]
-; CHECK-NEXT:    ldr x9, [x1, #32]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB69_7
-; CHECK-NEXT:  // %bb.5: // %loadbb5
-; CHECK-NEXT:    ldr x8, [x0, #40]
-; CHECK-NEXT:    ldr x9, [x1, #40]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB69_7
-; CHECK-NEXT:  // %bb.6:
-; CHECK-NEXT:    lsr w0, wzr, #31
-; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB69_7: // %res_block
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
-; CHECK-NEXT:    cneg w8, w8, hs
-; CHECK-NEXT:    lsr w0, w8, #31
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 48) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length48_gt(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length48_gt:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB70_7
-; CHECK-NEXT:  // %bb.1: // %loadbb1
-; CHECK-NEXT:    ldr x8, [x0, #8]
-; CHECK-NEXT:    ldr x9, [x1, #8]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB70_7
-; CHECK-NEXT:  // %bb.2: // %loadbb2
-; CHECK-NEXT:    ldr x8, [x0, #16]
-; CHECK-NEXT:    ldr x9, [x1, #16]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB70_7
-; CHECK-NEXT:  // %bb.3: // %loadbb3
-; CHECK-NEXT:    ldr x8, [x0, #24]
-; CHECK-NEXT:    ldr x9, [x1, #24]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB70_7
-; CHECK-NEXT:  // %bb.4: // %loadbb4
-; CHECK-NEXT:    ldr x8, [x0, #32]
-; CHECK-NEXT:    ldr x9, [x1, #32]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB70_7
-; CHECK-NEXT:  // %bb.5: // %loadbb5
-; CHECK-NEXT:    ldr x8, [x0, #40]
-; CHECK-NEXT:    ldr x9, [x1, #40]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB70_7
-; CHECK-NEXT:  // %bb.6:
-; CHECK-NEXT:    mov w8, wzr
-; CHECK-NEXT:    b .LBB70_8
-; CHECK-NEXT:  .LBB70_7: // %res_block
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
-; CHECK-NEXT:    cneg w8, w8, hs
-; CHECK-NEXT:  .LBB70_8: // %endblock
-; CHECK-NEXT:    cmp w8, #0
-; CHECK-NEXT:    cset w0, gt
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 48) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length48_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"="128" {
-; CHECK-LABEL: length48_eq_prefer128:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp x8, x11, [x1]
-; CHECK-NEXT:    ldp x9, x10, [x0]
-; CHECK-NEXT:    ldp x12, x13, [x1, #16]
-; CHECK-NEXT:    cmp x9, x8
-; CHECK-NEXT:    ldp x8, x9, [x0, #16]
-; CHECK-NEXT:    ccmp x10, x11, #0, eq
-; CHECK-NEXT:    ccmp x8, x12, #0, eq
-; CHECK-NEXT:    ldp x8, x11, [x0, #32]
-; CHECK-NEXT:    ldp x10, x12, [x1, #32]
-; CHECK-NEXT:    ccmp x9, x13, #0, eq
-; CHECK-NEXT:    ccmp x8, x10, #0, eq
-; CHECK-NEXT:    ccmp x11, x12, #0, eq
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 48) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length48_eq_const(ptr %X) nounwind {
-; CHECK-LABEL: length48_eq_const:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #12592 // =0x3130
-; CHECK-NEXT:    ldp x9, x10, [x0]
-; CHECK-NEXT:    movk x8, #13106, lsl #16
-; CHECK-NEXT:    ldp x11, x12, [x0, #16]
-; CHECK-NEXT:    movk x8, #13620, lsl #32
-; CHECK-NEXT:    movk x8, #14134, lsl #48
-; CHECK-NEXT:    cmp x9, x8
-; CHECK-NEXT:    mov x9, #14648 // =0x3938
-; CHECK-NEXT:    movk x9, #12592, lsl #16
-; CHECK-NEXT:    movk x9, #13106, lsl #32
-; CHECK-NEXT:    movk x9, #13620, lsl #48
-; CHECK-NEXT:    ccmp x10, x9, #0, eq
-; CHECK-NEXT:    mov x9, #14134 // =0x3736
-; CHECK-NEXT:    movk x9, #14648, lsl #16
-; CHECK-NEXT:    movk x9, #12592, lsl #32
-; CHECK-NEXT:    movk x9, #13106, lsl #48
-; CHECK-NEXT:    ccmp x11, x9, #0, eq
-; CHECK-NEXT:    mov x9, #13620 // =0x3534
-; CHECK-NEXT:    movk x9, #14134, lsl #16
-; CHECK-NEXT:    ldp x10, x11, [x0, #32]
-; CHECK-NEXT:    movk x9, #14648, lsl #32
-; CHECK-NEXT:    movk x9, #12592, lsl #48
-; CHECK-NEXT:    ccmp x12, x9, #0, eq
-; CHECK-NEXT:    mov x9, #13106 // =0x3332
-; CHECK-NEXT:    movk x9, #13620, lsl #16
-; CHECK-NEXT:    movk x9, #14134, lsl #32
-; CHECK-NEXT:    movk x9, #14648, lsl #48
-; CHECK-NEXT:    ccmp x10, x9, #0, eq
-; CHECK-NEXT:    ccmp x11, x8, #0, eq
-; CHECK-NEXT:    cset w0, ne
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 48) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length63(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length63:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB73_9
-; CHECK-NEXT:  // %bb.1: // %loadbb1
-; CHECK-NEXT:    ldr x8, [x0, #8]
-; CHECK-NEXT:    ldr x9, [x1, #8]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB73_9
-; CHECK-NEXT:  // %bb.2: // %loadbb2
-; CHECK-NEXT:    ldr x8, [x0, #16]
-; CHECK-NEXT:    ldr x9, [x1, #16]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB73_9
-; CHECK-NEXT:  // %bb.3: // %loadbb3
-; CHECK-NEXT:    ldr x8, [x0, #24]
-; CHECK-NEXT:    ldr x9, [x1, #24]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB73_9
-; CHECK-NEXT:  // %bb.4: // %loadbb4
-; CHECK-NEXT:    ldr x8, [x0, #32]
-; CHECK-NEXT:    ldr x9, [x1, #32]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB73_9
-; CHECK-NEXT:  // %bb.5: // %loadbb5
-; CHECK-NEXT:    ldr x8, [x0, #40]
-; CHECK-NEXT:    ldr x9, [x1, #40]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB73_9
-; CHECK-NEXT:  // %bb.6: // %loadbb6
-; CHECK-NEXT:    ldr x8, [x0, #48]
-; CHECK-NEXT:    ldr x9, [x1, #48]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB73_9
-; CHECK-NEXT:  // %bb.7: // %loadbb7
-; CHECK-NEXT:    ldur x8, [x0, #55]
-; CHECK-NEXT:    ldur x9, [x1, #55]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB73_9
-; CHECK-NEXT:  // %bb.8:
-; CHECK-NEXT:    mov w0, wzr
-; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB73_9: // %res_block
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
-; CHECK-NEXT:    cneg w0, w8, hs
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 63) nounwind
-  ret i32 %m
-}
-
-define i1 @length63_eq(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length63_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp x8, x11, [x1]
-; CHECK-NEXT:    ldp x9, x10, [x0]
-; CHECK-NEXT:    ldp x12, x13, [x1, #16]
-; CHECK-NEXT:    cmp x9, x8
-; CHECK-NEXT:    ldp x8, x9, [x0, #16]
-; CHECK-NEXT:    ccmp x10, x11, #0, eq
-; CHECK-NEXT:    ccmp x8, x12, #0, eq
-; CHECK-NEXT:    ldp x8, x11, [x0, #32]
-; CHECK-NEXT:    ldp x10, x12, [x1, #32]
-; CHECK-NEXT:    ccmp x9, x13, #0, eq
-; CHECK-NEXT:    ldr x9, [x0, #48]
-; CHECK-NEXT:    ldr x13, [x1, #48]
-; CHECK-NEXT:    ccmp x8, x10, #0, eq
-; CHECK-NEXT:    ldur x8, [x0, #55]
-; CHECK-NEXT:    ldur x10, [x1, #55]
-; CHECK-NEXT:    ccmp x11, x12, #0, eq
-; CHECK-NEXT:    ccmp x9, x13, #0, eq
-; CHECK-NEXT:    ccmp x8, x10, #0, eq
-; CHECK-NEXT:    cset w0, ne
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 63) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length63_lt(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length63_lt:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB75_9
-; CHECK-NEXT:  // %bb.1: // %loadbb1
-; CHECK-NEXT:    ldr x8, [x0, #8]
-; CHECK-NEXT:    ldr x9, [x1, #8]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB75_9
-; CHECK-NEXT:  // %bb.2: // %loadbb2
-; CHECK-NEXT:    ldr x8, [x0, #16]
-; CHECK-NEXT:    ldr x9, [x1, #16]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB75_9
-; CHECK-NEXT:  // %bb.3: // %loadbb3
-; CHECK-NEXT:    ldr x8, [x0, #24]
-; CHECK-NEXT:    ldr x9, [x1, #24]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB75_9
-; CHECK-NEXT:  // %bb.4: // %loadbb4
-; CHECK-NEXT:    ldr x8, [x0, #32]
-; CHECK-NEXT:    ldr x9, [x1, #32]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB75_9
-; CHECK-NEXT:  // %bb.5: // %loadbb5
-; CHECK-NEXT:    ldr x8, [x0, #40]
-; CHECK-NEXT:    ldr x9, [x1, #40]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB75_9
-; CHECK-NEXT:  // %bb.6: // %loadbb6
-; CHECK-NEXT:    ldr x8, [x0, #48]
-; CHECK-NEXT:    ldr x9, [x1, #48]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB75_9
-; CHECK-NEXT:  // %bb.7: // %loadbb7
-; CHECK-NEXT:    ldur x8, [x0, #55]
-; CHECK-NEXT:    ldur x9, [x1, #55]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB75_9
-; CHECK-NEXT:  // %bb.8:
-; CHECK-NEXT:    lsr w0, wzr, #31
-; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB75_9: // %res_block
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
-; CHECK-NEXT:    cneg w8, w8, hs
-; CHECK-NEXT:    lsr w0, w8, #31
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 63) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length63_gt(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length63_gt:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB76_9
-; CHECK-NEXT:  // %bb.1: // %loadbb1
-; CHECK-NEXT:    ldr x8, [x0, #8]
-; CHECK-NEXT:    ldr x9, [x1, #8]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB76_9
-; CHECK-NEXT:  // %bb.2: // %loadbb2
-; CHECK-NEXT:    ldr x8, [x0, #16]
-; CHECK-NEXT:    ldr x9, [x1, #16]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB76_9
-; CHECK-NEXT:  // %bb.3: // %loadbb3
-; CHECK-NEXT:    ldr x8, [x0, #24]
-; CHECK-NEXT:    ldr x9, [x1, #24]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB76_9
-; CHECK-NEXT:  // %bb.4: // %loadbb4
-; CHECK-NEXT:    ldr x8, [x0, #32]
-; CHECK-NEXT:    ldr x9, [x1, #32]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB76_9
-; CHECK-NEXT:  // %bb.5: // %loadbb5
-; CHECK-NEXT:    ldr x8, [x0, #40]
-; CHECK-NEXT:    ldr x9, [x1, #40]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB76_9
-; CHECK-NEXT:  // %bb.6: // %loadbb6
-; CHECK-NEXT:    ldr x8, [x0, #48]
-; CHECK-NEXT:    ldr x9, [x1, #48]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB76_9
-; CHECK-NEXT:  // %bb.7: // %loadbb7
-; CHECK-NEXT:    ldur x8, [x0, #55]
-; CHECK-NEXT:    ldur x9, [x1, #55]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB76_9
-; CHECK-NEXT:  // %bb.8:
-; CHECK-NEXT:    mov w8, wzr
-; CHECK-NEXT:    b .LBB76_10
-; CHECK-NEXT:  .LBB76_9: // %res_block
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
-; CHECK-NEXT:    cneg w8, w8, hs
-; CHECK-NEXT:  .LBB76_10: // %endblock
-; CHECK-NEXT:    cmp w8, #0
-; CHECK-NEXT:    cset w0, gt
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 63) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length63_eq_const(ptr %X) nounwind {
-; CHECK-LABEL: length63_eq_const:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #12592 // =0x3130
-; CHECK-NEXT:    ldp x9, x10, [x0]
-; CHECK-NEXT:    movk x8, #13106, lsl #16
-; CHECK-NEXT:    ldp x11, x12, [x0, #16]
-; CHECK-NEXT:    movk x8, #13620, lsl #32
-; CHECK-NEXT:    movk x8, #14134, lsl #48
-; CHECK-NEXT:    cmp x9, x8
-; CHECK-NEXT:    mov x9, #14648 // =0x3938
-; CHECK-NEXT:    movk x9, #12592, lsl #16
-; CHECK-NEXT:    movk x9, #13106, lsl #32
-; CHECK-NEXT:    movk x9, #13620, lsl #48
-; CHECK-NEXT:    ccmp x10, x9, #0, eq
-; CHECK-NEXT:    mov x10, #14134 // =0x3736
-; CHECK-NEXT:    movk x10, #14648, lsl #16
-; CHECK-NEXT:    movk x10, #12592, lsl #32
-; CHECK-NEXT:    movk x10, #13106, lsl #48
-; CHECK-NEXT:    ccmp x11, x10, #0, eq
-; CHECK-NEXT:    mov x10, #13620 // =0x3534
-; CHECK-NEXT:    movk x10, #14134, lsl #16
-; CHECK-NEXT:    ldp x11, x13, [x0, #32]
-; CHECK-NEXT:    movk x10, #14648, lsl #32
-; CHECK-NEXT:    movk x10, #12592, lsl #48
-; CHECK-NEXT:    ccmp x12, x10, #0, eq
-; CHECK-NEXT:    mov x10, #13106 // =0x3332
-; CHECK-NEXT:    ldr x12, [x0, #48]
-; CHECK-NEXT:    movk x10, #13620, lsl #16
-; CHECK-NEXT:    movk x10, #14134, lsl #32
-; CHECK-NEXT:    movk x10, #14648, lsl #48
-; CHECK-NEXT:    ccmp x11, x10, #0, eq
-; CHECK-NEXT:    ldur x10, [x0, #55]
-; CHECK-NEXT:    ccmp x13, x8, #0, eq
-; CHECK-NEXT:    mov x8, #13877 // =0x3635
-; CHECK-NEXT:    movk x8, #14391, lsl #16
-; CHECK-NEXT:    ccmp x12, x9, #0, eq
-; CHECK-NEXT:    movk x8, #12345, lsl #32
-; CHECK-NEXT:    movk x8, #12849, lsl #48
-; CHECK-NEXT:    ccmp x10, x8, #0, eq
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 63) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length64(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length64:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB78_9
-; CHECK-NEXT:  // %bb.1: // %loadbb1
-; CHECK-NEXT:    ldr x8, [x0, #8]
-; CHECK-NEXT:    ldr x9, [x1, #8]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB78_9
-; CHECK-NEXT:  // %bb.2: // %loadbb2
-; CHECK-NEXT:    ldr x8, [x0, #16]
-; CHECK-NEXT:    ldr x9, [x1, #16]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB78_9
-; CHECK-NEXT:  // %bb.3: // %loadbb3
-; CHECK-NEXT:    ldr x8, [x0, #24]
-; CHECK-NEXT:    ldr x9, [x1, #24]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB78_9
-; CHECK-NEXT:  // %bb.4: // %loadbb4
-; CHECK-NEXT:    ldr x8, [x0, #32]
-; CHECK-NEXT:    ldr x9, [x1, #32]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB78_9
-; CHECK-NEXT:  // %bb.5: // %loadbb5
-; CHECK-NEXT:    ldr x8, [x0, #40]
-; CHECK-NEXT:    ldr x9, [x1, #40]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB78_9
-; CHECK-NEXT:  // %bb.6: // %loadbb6
-; CHECK-NEXT:    ldr x8, [x0, #48]
-; CHECK-NEXT:    ldr x9, [x1, #48]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB78_9
-; CHECK-NEXT:  // %bb.7: // %loadbb7
-; CHECK-NEXT:    ldr x8, [x0, #56]
-; CHECK-NEXT:    ldr x9, [x1, #56]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB78_9
-; CHECK-NEXT:  // %bb.8:
-; CHECK-NEXT:    mov w0, wzr
-; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB78_9: // %res_block
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
-; CHECK-NEXT:    cneg w0, w8, hs
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 64) nounwind
-  ret i32 %m
-}
-
-define i1 @length64_eq(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length64_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp x8, x11, [x1]
-; CHECK-NEXT:    ldp x9, x10, [x0]
-; CHECK-NEXT:    ldp x12, x13, [x1, #16]
-; CHECK-NEXT:    cmp x9, x8
-; CHECK-NEXT:    ldp x8, x9, [x0, #16]
-; CHECK-NEXT:    ccmp x10, x11, #0, eq
-; CHECK-NEXT:    ccmp x8, x12, #0, eq
-; CHECK-NEXT:    ldp x8, x11, [x0, #32]
-; CHECK-NEXT:    ldp x10, x12, [x1, #32]
-; CHECK-NEXT:    ccmp x9, x13, #0, eq
-; CHECK-NEXT:    ldp x9, x13, [x1, #48]
-; CHECK-NEXT:    ccmp x8, x10, #0, eq
-; CHECK-NEXT:    ldp x8, x10, [x0, #48]
-; CHECK-NEXT:    ccmp x11, x12, #0, eq
-; CHECK-NEXT:    ccmp x8, x9, #0, eq
-; CHECK-NEXT:    ccmp x10, x13, #0, eq
-; CHECK-NEXT:    cset w0, ne
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 64) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length64_lt(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length64_lt:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB80_9
-; CHECK-NEXT:  // %bb.1: // %loadbb1
-; CHECK-NEXT:    ldr x8, [x0, #8]
-; CHECK-NEXT:    ldr x9, [x1, #8]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB80_9
-; CHECK-NEXT:  // %bb.2: // %loadbb2
-; CHECK-NEXT:    ldr x8, [x0, #16]
-; CHECK-NEXT:    ldr x9, [x1, #16]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB80_9
-; CHECK-NEXT:  // %bb.3: // %loadbb3
-; CHECK-NEXT:    ldr x8, [x0, #24]
-; CHECK-NEXT:    ldr x9, [x1, #24]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB80_9
-; CHECK-NEXT:  // %bb.4: // %loadbb4
-; CHECK-NEXT:    ldr x8, [x0, #32]
-; CHECK-NEXT:    ldr x9, [x1, #32]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB80_9
-; CHECK-NEXT:  // %bb.5: // %loadbb5
-; CHECK-NEXT:    ldr x8, [x0, #40]
-; CHECK-NEXT:    ldr x9, [x1, #40]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB80_9
-; CHECK-NEXT:  // %bb.6: // %loadbb6
-; CHECK-NEXT:    ldr x8, [x0, #48]
-; CHECK-NEXT:    ldr x9, [x1, #48]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB80_9
-; CHECK-NEXT:  // %bb.7: // %loadbb7
-; CHECK-NEXT:    ldr x8, [x0, #56]
-; CHECK-NEXT:    ldr x9, [x1, #56]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB80_9
-; CHECK-NEXT:  // %bb.8:
-; CHECK-NEXT:    lsr w0, wzr, #31
-; CHECK-NEXT:    ret
-; CHECK-NEXT:  .LBB80_9: // %res_block
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
-; CHECK-NEXT:    cneg w8, w8, hs
-; CHECK-NEXT:    lsr w0, w8, #31
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 64) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length64_gt(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length64_gt:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x8, [x0]
-; CHECK-NEXT:    ldr x9, [x1]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB81_9
-; CHECK-NEXT:  // %bb.1: // %loadbb1
-; CHECK-NEXT:    ldr x8, [x0, #8]
-; CHECK-NEXT:    ldr x9, [x1, #8]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB81_9
-; CHECK-NEXT:  // %bb.2: // %loadbb2
-; CHECK-NEXT:    ldr x8, [x0, #16]
-; CHECK-NEXT:    ldr x9, [x1, #16]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB81_9
-; CHECK-NEXT:  // %bb.3: // %loadbb3
-; CHECK-NEXT:    ldr x8, [x0, #24]
-; CHECK-NEXT:    ldr x9, [x1, #24]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB81_9
-; CHECK-NEXT:  // %bb.4: // %loadbb4
-; CHECK-NEXT:    ldr x8, [x0, #32]
-; CHECK-NEXT:    ldr x9, [x1, #32]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB81_9
-; CHECK-NEXT:  // %bb.5: // %loadbb5
-; CHECK-NEXT:    ldr x8, [x0, #40]
-; CHECK-NEXT:    ldr x9, [x1, #40]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB81_9
-; CHECK-NEXT:  // %bb.6: // %loadbb6
-; CHECK-NEXT:    ldr x8, [x0, #48]
-; CHECK-NEXT:    ldr x9, [x1, #48]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB81_9
-; CHECK-NEXT:  // %bb.7: // %loadbb7
-; CHECK-NEXT:    ldr x8, [x0, #56]
-; CHECK-NEXT:    ldr x9, [x1, #56]
-; CHECK-NEXT:    rev x8, x8
-; CHECK-NEXT:    rev x9, x9
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    b.ne .LBB81_9
-; CHECK-NEXT:  // %bb.8:
-; CHECK-NEXT:    mov w8, wzr
-; CHECK-NEXT:    b .LBB81_10
-; CHECK-NEXT:  .LBB81_9: // %res_block
-; CHECK-NEXT:    cmp x8, x9
-; CHECK-NEXT:    mov w8, #-1 // =0xffffffff
-; CHECK-NEXT:    cneg w8, w8, hs
-; CHECK-NEXT:  .LBB81_10: // %endblock
-; CHECK-NEXT:    cmp w8, #0
-; CHECK-NEXT:    cset w0, gt
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 64) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length64_eq_const(ptr %X) nounwind {
-; CHECK-LABEL: length64_eq_const:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x8, #12592 // =0x3130
-; CHECK-NEXT:    ldp x9, x10, [x0]
-; CHECK-NEXT:    movk x8, #13106, lsl #16
-; CHECK-NEXT:    ldp x11, x12, [x0, #16]
-; CHECK-NEXT:    movk x8, #13620, lsl #32
-; CHECK-NEXT:    ldp x13, x14, [x0, #32]
-; CHECK-NEXT:    movk x8, #14134, lsl #48
-; CHECK-NEXT:    cmp x9, x8
-; CHECK-NEXT:    mov x9, #14648 // =0x3938
-; CHECK-NEXT:    movk x9, #12592, lsl #16
-; CHECK-NEXT:    movk x9, #13106, lsl #32
-; CHECK-NEXT:    movk x9, #13620, lsl #48
-; CHECK-NEXT:    ccmp x10, x9, #0, eq
-; CHECK-NEXT:    mov x10, #14134 // =0x3736
-; CHECK-NEXT:    movk x10, #14648, lsl #16
-; CHECK-NEXT:    movk x10, #12592, lsl #32
-; CHECK-NEXT:    movk x10, #13106, lsl #48
-; CHECK-NEXT:    ccmp x11, x10, #0, eq
-; CHECK-NEXT:    mov x11, #13620 // =0x3534
-; CHECK-NEXT:    movk x11, #14134, lsl #16
-; CHECK-NEXT:    movk x11, #14648, lsl #32
-; CHECK-NEXT:    movk x11, #12592, lsl #48
-; CHECK-NEXT:    ccmp x12, x11, #0, eq
-; CHECK-NEXT:    mov x11, #13106 // =0x3332
-; CHECK-NEXT:    movk x11, #13620, lsl #16
-; CHECK-NEXT:    movk x11, #14134, lsl #32
-; CHECK-NEXT:    movk x11, #14648, lsl #48
-; CHECK-NEXT:    ccmp x13, x11, #0, eq
-; CHECK-NEXT:    ldp x11, x12, [x0, #48]
-; CHECK-NEXT:    ccmp x14, x8, #0, eq
-; CHECK-NEXT:    ccmp x11, x9, #0, eq
-; CHECK-NEXT:    ccmp x12, x10, #0, eq
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 64) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length96(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length96:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w2, #96 // =0x60
-; CHECK-NEXT:    b memcmp
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 96) nounwind
-  ret i32 %m
-}
-
-define i1 @length96_eq(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length96_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    mov w2, #96 // =0x60
-; CHECK-NEXT:    bl memcmp
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    cset w0, ne
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 96) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length96_lt(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length96_lt:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    mov w2, #96 // =0x60
-; CHECK-NEXT:    bl memcmp
-; CHECK-NEXT:    lsr w0, w0, #31
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 96) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length96_gt(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length96_gt:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    mov w2, #96 // =0x60
-; CHECK-NEXT:    bl memcmp
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    cset w0, gt
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 96) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length96_eq_const(ptr %X) nounwind {
-; CHECK-LABEL: length96_eq_const:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    adrp x1, .L.str
-; CHECK-NEXT:    add x1, x1, :lo12:.L.str
-; CHECK-NEXT:    mov w2, #96 // =0x60
-; CHECK-NEXT:    bl memcmp
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 96) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length127(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length127:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w2, #127 // =0x7f
-; CHECK-NEXT:    b memcmp
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 127) nounwind
-  ret i32 %m
-}
-
-define i1 @length127_eq(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length127_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    mov w2, #127 // =0x7f
-; CHECK-NEXT:    bl memcmp
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    cset w0, ne
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 127) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length127_lt(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length127_lt:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    mov w2, #127 // =0x7f
-; CHECK-NEXT:    bl memcmp
-; CHECK-NEXT:    lsr w0, w0, #31
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 127) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length127_gt(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length127_gt:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    mov w2, #127 // =0x7f
-; CHECK-NEXT:    bl memcmp
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    cset w0, gt
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 127) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length127_eq_const(ptr %X) nounwind {
-; CHECK-LABEL: length127_eq_const:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    adrp x1, .L.str
-; CHECK-NEXT:    add x1, x1, :lo12:.L.str
-; CHECK-NEXT:    mov w2, #127 // =0x7f
-; CHECK-NEXT:    bl memcmp
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 127) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length128(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length128:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w2, #128 // =0x80
-; CHECK-NEXT:    b memcmp
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 128) nounwind
-  ret i32 %m
-}
-
-define i1 @length128_eq(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length128_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    mov w2, #128 // =0x80
-; CHECK-NEXT:    bl memcmp
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    cset w0, ne
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 128) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length128_lt(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length128_lt:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    mov w2, #128 // =0x80
-; CHECK-NEXT:    bl memcmp
-; CHECK-NEXT:    lsr w0, w0, #31
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 128) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length128_gt(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length128_gt:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    mov w2, #128 // =0x80
-; CHECK-NEXT:    bl memcmp
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    cset w0, gt
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 128) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length128_eq_const(ptr %X) nounwind {
-; CHECK-LABEL: length128_eq_const:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    adrp x1, .L.str
-; CHECK-NEXT:    add x1, x1, :lo12:.L.str
-; CHECK-NEXT:    mov w2, #128 // =0x80
-; CHECK-NEXT:    bl memcmp
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 128) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length192(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length192:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w2, #192 // =0xc0
-; CHECK-NEXT:    b memcmp
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 192) nounwind
-  ret i32 %m
-}
-
-define i1 @length192_eq(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length192_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    mov w2, #192 // =0xc0
-; CHECK-NEXT:    bl memcmp
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    cset w0, ne
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 192) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length192_lt(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length192_lt:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    mov w2, #192 // =0xc0
-; CHECK-NEXT:    bl memcmp
-; CHECK-NEXT:    lsr w0, w0, #31
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 192) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length192_gt(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length192_gt:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    mov w2, #192 // =0xc0
-; CHECK-NEXT:    bl memcmp
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    cset w0, gt
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 192) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length192_eq_const(ptr %X) nounwind {
-; CHECK-LABEL: length192_eq_const:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    adrp x1, .L.str
-; CHECK-NEXT:    add x1, x1, :lo12:.L.str
-; CHECK-NEXT:    mov w2, #192 // =0xc0
-; CHECK-NEXT:    bl memcmp
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 192) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length255(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length255:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w2, #255 // =0xff
-; CHECK-NEXT:    b memcmp
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 255) nounwind
-  ret i32 %m
-}
-
-define i1 @length255_eq(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length255_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    mov w2, #255 // =0xff
-; CHECK-NEXT:    bl memcmp
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    cset w0, ne
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 255) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length255_lt(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length255_lt:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    mov w2, #255 // =0xff
-; CHECK-NEXT:    bl memcmp
-; CHECK-NEXT:    lsr w0, w0, #31
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 255) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length255_gt(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length255_gt:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    mov w2, #255 // =0xff
-; CHECK-NEXT:    bl memcmp
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    cset w0, gt
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 255) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length255_eq_const(ptr %X) nounwind {
-; CHECK-LABEL: length255_eq_const:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    adrp x1, .L.str
-; CHECK-NEXT:    add x1, x1, :lo12:.L.str
-; CHECK-NEXT:    mov w2, #255 // =0xff
-; CHECK-NEXT:    bl memcmp
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 255) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length256(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length256:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w2, #256 // =0x100
-; CHECK-NEXT:    b memcmp
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 256) nounwind
-  ret i32 %m
-}
-
-define i1 @length256_eq(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length256_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    mov w2, #256 // =0x100
-; CHECK-NEXT:    bl memcmp
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    cset w0, ne
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 256) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length256_lt(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length256_lt:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    mov w2, #256 // =0x100
-; CHECK-NEXT:    bl memcmp
-; CHECK-NEXT:    lsr w0, w0, #31
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 256) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length256_gt(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length256_gt:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    mov w2, #256 // =0x100
-; CHECK-NEXT:    bl memcmp
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    cset w0, gt
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 256) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length256_eq_const(ptr %X) nounwind {
-; CHECK-LABEL: length256_eq_const:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    adrp x1, .L.str
-; CHECK-NEXT:    add x1, x1, :lo12:.L.str
-; CHECK-NEXT:    mov w2, #256 // =0x100
-; CHECK-NEXT:    bl memcmp
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 256) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length384(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length384:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w2, #384 // =0x180
-; CHECK-NEXT:    b memcmp
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 384) nounwind
-  ret i32 %m
-}
-
-define i1 @length384_eq(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length384_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    mov w2, #384 // =0x180
-; CHECK-NEXT:    bl memcmp
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    cset w0, ne
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 384) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length384_lt(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length384_lt:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    mov w2, #384 // =0x180
-; CHECK-NEXT:    bl memcmp
-; CHECK-NEXT:    lsr w0, w0, #31
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 384) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length384_gt(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length384_gt:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    mov w2, #384 // =0x180
-; CHECK-NEXT:    bl memcmp
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    cset w0, gt
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 384) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length384_eq_const(ptr %X) nounwind {
-; CHECK-LABEL: length384_eq_const:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    adrp x1, .L.str
-; CHECK-NEXT:    add x1, x1, :lo12:.L.str
-; CHECK-NEXT:    mov w2, #384 // =0x180
-; CHECK-NEXT:    bl memcmp
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 384) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length511(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length511:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w2, #511 // =0x1ff
-; CHECK-NEXT:    b memcmp
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 511) nounwind
-  ret i32 %m
-}
-
-define i1 @length511_eq(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length511_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    mov w2, #511 // =0x1ff
-; CHECK-NEXT:    bl memcmp
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    cset w0, ne
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 511) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length511_lt(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length511_lt:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    mov w2, #511 // =0x1ff
-; CHECK-NEXT:    bl memcmp
-; CHECK-NEXT:    lsr w0, w0, #31
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 511) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length511_gt(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length511_gt:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    mov w2, #511 // =0x1ff
-; CHECK-NEXT:    bl memcmp
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    cset w0, gt
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 511) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length511_eq_const(ptr %X) nounwind {
-; CHECK-LABEL: length511_eq_const:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    adrp x1, .L.str
-; CHECK-NEXT:    add x1, x1, :lo12:.L.str
-; CHECK-NEXT:    mov w2, #511 // =0x1ff
-; CHECK-NEXT:    bl memcmp
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 511) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length512(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length512:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w2, #512 // =0x200
-; CHECK-NEXT:    b memcmp
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 512) nounwind
-  ret i32 %m
-}
-
-define i1 @length512_eq(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length512_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    mov w2, #512 // =0x200
-; CHECK-NEXT:    bl memcmp
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    cset w0, ne
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 512) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length512_lt(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length512_lt:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    mov w2, #512 // =0x200
-; CHECK-NEXT:    bl memcmp
-; CHECK-NEXT:    lsr w0, w0, #31
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 512) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length512_gt(ptr %x, ptr %y) nounwind {
-; CHECK-LABEL: length512_gt:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    mov w2, #512 // =0x200
-; CHECK-NEXT:    bl memcmp
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    cset w0, gt
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 512) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length512_eq_const(ptr %X) nounwind {
-; CHECK-LABEL: length512_eq_const:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    adrp x1, .L.str
-; CHECK-NEXT:    add x1, x1, :lo12:.L.str
-; CHECK-NEXT:    mov w2, #512 // =0x200
-; CHECK-NEXT:    bl memcmp
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 512) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @huge_length(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: huge_length:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x2, #9223372036854775807 // =0x7fffffffffffffff
-; CHECK-NEXT:    b memcmp
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 9223372036854775807) nounwind
-  ret i32 %m
-}
-
-define i1 @huge_length_eq(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: huge_length_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    mov x2, #9223372036854775807 // =0x7fffffffffffffff
-; CHECK-NEXT:    bl memcmp
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 9223372036854775807) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @nonconst_length(ptr %X, ptr %Y, i64 %size) nounwind {
-; CHECK-LABEL: nonconst_length:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    b memcmp
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 %size) nounwind
-  ret i32 %m
-}
-
-define i1 @nonconst_length_eq(ptr %X, ptr %Y, i64 %size) nounwind {
-; CHECK-LABEL: nonconst_length_eq:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    bl memcmp
-; CHECK-NEXT:    cmp w0, #0
-; CHECK-NEXT:    cset w0, eq
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 %size) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index 8b0b6263832243..84210ec410d29f 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -203,13 +203,6 @@
 ; GCN-O1-NEXT:        Canonicalize Freeze Instructions in Loops
 ; GCN-O1-NEXT:        Induction Variable Users
 ; GCN-O1-NEXT:        Loop Strength Reduction
-; GCN-O1-NEXT:      Basic Alias Analysis (stateless AA impl)
-; GCN-O1-NEXT:      Function Alias Analysis Results
-; GCN-O1-NEXT:      Merge contiguous icmps into a memcmp
-; GCN-O1-NEXT:      Natural Loop Information
-; GCN-O1-NEXT:      Lazy Branch Probability Analysis
-; GCN-O1-NEXT:      Lazy Block Frequency Analysis
-; GCN-O1-NEXT:      Expand memcmp() to load/stores
 ; GCN-O1-NEXT:      Lower constant intrinsics
 ; GCN-O1-NEXT:      Remove unreachable blocks from the CFG
 ; GCN-O1-NEXT:      Natural Loop Information
@@ -484,13 +477,6 @@
 ; GCN-O1-OPTS-NEXT:        Canonicalize Freeze Instructions in Loops
 ; GCN-O1-OPTS-NEXT:        Induction Variable Users
 ; GCN-O1-OPTS-NEXT:        Loop Strength Reduction
-; GCN-O1-OPTS-NEXT:      Basic Alias Analysis (stateless AA impl)
-; GCN-O1-OPTS-NEXT:      Function Alias Analysis Results
-; GCN-O1-OPTS-NEXT:      Merge contiguous icmps into a memcmp
-; GCN-O1-OPTS-NEXT:      Natural Loop Information
-; GCN-O1-OPTS-NEXT:      Lazy Branch Probability Analysis
-; GCN-O1-OPTS-NEXT:      Lazy Block Frequency Analysis
-; GCN-O1-OPTS-NEXT:      Expand memcmp() to load/stores
 ; GCN-O1-OPTS-NEXT:      Lower constant intrinsics
 ; GCN-O1-OPTS-NEXT:      Remove unreachable blocks from the CFG
 ; GCN-O1-OPTS-NEXT:      Natural Loop Information
@@ -784,13 +770,6 @@
 ; GCN-O2-NEXT:        Canonicalize Freeze Instructions in Loops
 ; GCN-O2-NEXT:        Induction Variable Users
 ; GCN-O2-NEXT:        Loop Strength Reduction
-; GCN-O2-NEXT:      Basic Alias Analysis (stateless AA impl)
-; GCN-O2-NEXT:      Function Alias Analysis Results
-; GCN-O2-NEXT:      Merge contiguous icmps into a memcmp
-; GCN-O2-NEXT:      Natural Loop Information
-; GCN-O2-NEXT:      Lazy Branch Probability Analysis
-; GCN-O2-NEXT:      Lazy Block Frequency Analysis
-; GCN-O2-NEXT:      Expand memcmp() to load/stores
 ; GCN-O2-NEXT:      Lower constant intrinsics
 ; GCN-O2-NEXT:      Remove unreachable blocks from the CFG
 ; GCN-O2-NEXT:      Natural Loop Information
@@ -1092,13 +1071,6 @@
 ; GCN-O3-NEXT:        Canonicalize Freeze Instructions in Loops
 ; GCN-O3-NEXT:        Induction Variable Users
 ; GCN-O3-NEXT:        Loop Strength Reduction
-; GCN-O3-NEXT:      Basic Alias Analysis (stateless AA impl)
-; GCN-O3-NEXT:      Function Alias Analysis Results
-; GCN-O3-NEXT:      Merge contiguous icmps into a memcmp
-; GCN-O3-NEXT:      Natural Loop Information
-; GCN-O3-NEXT:      Lazy Branch Probability Analysis
-; GCN-O3-NEXT:      Lazy Block Frequency Analysis
-; GCN-O3-NEXT:      Expand memcmp() to load/stores
 ; GCN-O3-NEXT:      Lower constant intrinsics
 ; GCN-O3-NEXT:      Remove unreachable blocks from the CFG
 ; GCN-O3-NEXT:      Natural Loop Information
diff --git a/llvm/test/CodeGen/ARM/O3-pipeline.ll b/llvm/test/CodeGen/ARM/O3-pipeline.ll
index 5e565970fc3a86..f2bef2c7e46acc 100644
--- a/llvm/test/CodeGen/ARM/O3-pipeline.ll
+++ b/llvm/test/CodeGen/ARM/O3-pipeline.ll
@@ -21,13 +21,6 @@
 ; CHECK-NEXT:        Canonicalize Freeze Instructions in Loops
 ; CHECK-NEXT:        Induction Variable Users
 ; CHECK-NEXT:        Loop Strength Reduction
-; CHECK-NEXT:      Basic Alias Analysis (stateless AA impl)
-; CHECK-NEXT:      Function Alias Analysis Results
-; CHECK-NEXT:      Merge contiguous icmps into a memcmp
-; CHECK-NEXT:      Natural Loop Information
-; CHECK-NEXT:      Lazy Branch Probability Analysis
-; CHECK-NEXT:      Lazy Block Frequency Analysis
-; CHECK-NEXT:      Expand memcmp() to load/stores
 ; CHECK-NEXT:      Lower Garbage Collection Instructions
 ; CHECK-NEXT:      Shadow Stack GC Lowering
 ; CHECK-NEXT:      Lower constant intrinsics
diff --git a/llvm/test/CodeGen/BPF/memcmp.ll b/llvm/test/CodeGen/BPF/memcmp.ll
deleted file mode 100644
index 7ed8dc1e736f4c..00000000000000
--- a/llvm/test/CodeGen/BPF/memcmp.ll
+++ /dev/null
@@ -1,77 +0,0 @@
-; RUN: llc -march=bpfel < %s | FileCheck %s
-; RUN: llc -march=bpfel -mcpu=v3 < %s | FileCheck %s
-;
-; Source code:
-;   /* set aligned 4 to minimize the number of loads */
-;   struct build_id {
-;     unsigned char id[20];
-;   } __attribute__((aligned(4)));
-;
-;   /* try to compute a local build_id */
-;   void bar1(ptr);
-;
-;   /* the global build_id to compare */
-;   struct build_id id2;
-;
-;   int foo()
-;   {
-;     struct build_id id1;
-;
-;     bar1(&id1);
-;     return __builtin_memcmp(&id1, &id2, sizeof(id1)) == 0;
-;   }
-; Compilation flags:
-;   clang -target bpf -S -O2 t.c -emit-llvm
-
-
-%struct.build_id = type { [20 x i8] }
-
-@id2 = dso_local global %struct.build_id zeroinitializer, align 4
-
-; Function Attrs: nounwind
-define dso_local i32 @foo() local_unnamed_addr #0 {
-entry:
-  %id11 = alloca [20 x i8], align 4
-  call void @llvm.lifetime.start.p0(i64 20, ptr nonnull %id11) #4
-  call void @bar1(ptr noundef nonnull %id11) #4
-  %call = call i32 @memcmp(ptr noundef nonnull dereferenceable(20) %id11, ptr noundef nonnull dereferenceable(20) @id2, i64 noundef 20) #4
-  %cmp = icmp eq i32 %call, 0
-  %conv = zext i1 %cmp to i32
-  call void @llvm.lifetime.end.p0(i64 20, ptr nonnull %id11) #4
-  ret i32 %conv
-}
-
-; CHECK-DAG:   *(u32 *)(r1 + 0)
-; CHECK-DAG:   *(u32 *)(r1 + 4)
-; CHECK-DAG:   *(u32 *)(r10 - 16)
-; CHECK-DAG:   *(u32 *)(r10 - 20)
-; CHECK-DAG:   *(u32 *)(r10 - 8)
-; CHECK-DAG:   *(u32 *)(r10 - 12)
-; CHECK-DAG:   *(u32 *)(r1 + 8)
-; CHECK-DAG:   *(u32 *)(r1 + 12)
-; CHECK-DAG:   *(u32 *)(r2 + 16)
-; CHECK-DAG:   *(u32 *)(r10 - 4)
-
-; Function Attrs: argmemonly mustprogress nofree nosync nounwind willreturn
-declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1
-
-declare dso_local void @bar1(ptr noundef) local_unnamed_addr #2
-
-; Function Attrs: argmemonly mustprogress nofree nounwind readonly willreturn
-declare dso_local i32 @memcmp(ptr nocapture noundef, ptr nocapture noundef, i64 noundef) local_unnamed_addr #3
-
-; Function Attrs: argmemonly mustprogress nofree nosync nounwind willreturn
-declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1
-
-attributes #0 = { nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
-attributes #1 = { argmemonly mustprogress nofree nosync nounwind willreturn }
-attributes #2 = { "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
-attributes #3 = { argmemonly mustprogress nofree nounwind readonly willreturn "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
-attributes #4 = { nounwind }
-
-!llvm.module.flags = !{!0, !1}
-!llvm.ident = !{!2}
-
-!0 = !{i32 1, !"wchar_size", i32 4}
-!1 = !{i32 7, !"frame-pointer", i32 2}
-!2 = !{!"clang version 15.0.0 (https://github.com/llvm/llvm-project.git dea65874b2505f8f5e8e51fd8cad6908feb375ec)"}
diff --git a/llvm/test/CodeGen/Generic/llc-start-stop.ll b/llvm/test/CodeGen/Generic/llc-start-stop.ll
index b02472473a00cb..9ada245835981b 100644
--- a/llvm/test/CodeGen/Generic/llc-start-stop.ll
+++ b/llvm/test/CodeGen/Generic/llc-start-stop.ll
@@ -19,15 +19,15 @@
 ; STOP-BEFORE-NOT: Loop Strength Reduction
 
 ; RUN: llc < %s -debug-pass=Structure -start-after=loop-reduce -o /dev/null 2>&1 | FileCheck %s -check-prefix=START-AFTER
-; START-AFTER: -aa -mergeicmps
+; START-AFTER: -gc-lowering
 ; START-AFTER: FunctionPass Manager
-; START-AFTER-NEXT: Dominator Tree Construction
+; START-AFTER-NEXT: Lower Garbage Collection Instructions
 
 ; RUN: llc < %s -debug-pass=Structure -start-before=loop-reduce -o /dev/null 2>&1 | FileCheck %s -check-prefix=START-BEFORE
 ; START-BEFORE: -machine-branch-prob -regalloc-evict -regalloc-priority -domtree
 ; START-BEFORE: FunctionPass Manager
 ; START-BEFORE: Loop Strength Reduction
-; START-BEFORE-NEXT: Basic Alias Analysis (stateless AA impl)
+; START-BEFORE-NEXT: Lower Garbage Collection Instructions
 
 ; RUN: not --crash llc < %s -start-before=nonexistent -o /dev/null 2>&1 | FileCheck %s -check-prefix=NONEXISTENT-START-BEFORE
 ; RUN: not --crash llc < %s -stop-before=nonexistent -o /dev/null 2>&1 | FileCheck %s -check-prefix=NONEXISTENT-STOP-BEFORE
diff --git a/llvm/test/CodeGen/LoongArch/opt-pipeline.ll b/llvm/test/CodeGen/LoongArch/opt-pipeline.ll
index 3134d940545e80..696d8c8be017cb 100644
--- a/llvm/test/CodeGen/LoongArch/opt-pipeline.ll
+++ b/llvm/test/CodeGen/LoongArch/opt-pipeline.ll
@@ -23,8 +23,8 @@
 ; CHECK-NEXT: Type-Based Alias Analysis
 ; CHECK-NEXT: Scoped NoAlias Alias Analysis
 ; CHECK-NEXT: Assumption Cache Tracker
-; CHECK-NEXT: Profile summary info
 ; CHECK-NEXT: Create Garbage Collector Module Metadata
+; CHECK-NEXT: Profile summary info
 ; CHECK-NEXT: Machine Branch Probability Analysis
 ; CHECK-NEXT: Default Regalloc Eviction Advisor
 ; CHECK-NEXT: Default Regalloc Priority Advisor
@@ -44,13 +44,6 @@
 ; CHECK-NEXT:         Canonicalize Freeze Instructions in Loops
 ; CHECK-NEXT:         Induction Variable Users
 ; CHECK-NEXT:         Loop Strength Reduction
-; CHECK-NEXT:       Basic Alias Analysis (stateless AA impl)
-; CHECK-NEXT:       Function Alias Analysis Results
-; CHECK-NEXT:       Merge contiguous icmps into a memcmp
-; CHECK-NEXT:       Natural Loop Information
-; CHECK-NEXT:       Lazy Branch Probability Analysis
-; CHECK-NEXT:       Lazy Block Frequency Analysis
-; CHECK-NEXT:       Expand memcmp() to load/stores
 ; CHECK-NEXT:       Lower Garbage Collection Instructions
 ; CHECK-NEXT:       Shadow Stack GC Lowering
 ; CHECK-NEXT:       Lower constant intrinsics
diff --git a/llvm/test/CodeGen/M68k/pipeline.ll b/llvm/test/CodeGen/M68k/pipeline.ll
index dfaa149b7a4744..ad053cf4d61a07 100644
--- a/llvm/test/CodeGen/M68k/pipeline.ll
+++ b/llvm/test/CodeGen/M68k/pipeline.ll
@@ -15,13 +15,6 @@
 ; CHECK-NEXT:        Canonicalize Freeze Instructions in Loops
 ; CHECK-NEXT:        Induction Variable Users
 ; CHECK-NEXT:        Loop Strength Reduction
-; CHECK-NEXT:      Basic Alias Analysis (stateless AA impl)
-; CHECK-NEXT:      Function Alias Analysis Results
-; CHECK-NEXT:      Merge contiguous icmps into a memcmp
-; CHECK-NEXT:      Natural Loop Information
-; CHECK-NEXT:      Lazy Branch Probability Analysis
-; CHECK-NEXT:      Lazy Block Frequency Analysis
-; CHECK-NEXT:      Expand memcmp() to load/stores
 ; CHECK-NEXT:      Lower Garbage Collection Instructions
 ; CHECK-NEXT:      Shadow Stack GC Lowering
 ; CHECK-NEXT:      Lower constant intrinsics
diff --git a/llvm/test/CodeGen/PowerPC/O3-pipeline.ll b/llvm/test/CodeGen/PowerPC/O3-pipeline.ll
index 6ce4416211cc4d..1fdb4802eff036 100644
--- a/llvm/test/CodeGen/PowerPC/O3-pipeline.ll
+++ b/llvm/test/CodeGen/PowerPC/O3-pipeline.ll
@@ -11,8 +11,8 @@
 ; CHECK-NEXT: Assumption Cache Tracker
 ; CHECK-NEXT: Type-Based Alias Analysis
 ; CHECK-NEXT: Scoped NoAlias Alias Analysis
-; CHECK-NEXT: Profile summary info
 ; CHECK-NEXT: Create Garbage Collector Module Metadata
+; CHECK-NEXT: Profile summary info
 ; CHECK-NEXT: Machine Branch Probability Analysis
 ; CHECK-NEXT: Default Regalloc Eviction Advisor
 ; CHECK-NEXT: Default Regalloc Priority Advisor
@@ -45,13 +45,6 @@
 ; CHECK-NEXT:         Canonicalize Freeze Instructions in Loops
 ; CHECK-NEXT:         Induction Variable Users
 ; CHECK-NEXT:         Loop Strength Reduction
-; CHECK-NEXT:       Basic Alias Analysis (stateless AA impl)
-; CHECK-NEXT:       Function Alias Analysis Results
-; CHECK-NEXT:       Merge contiguous icmps into a memcmp
-; CHECK-NEXT:       Natural Loop Information
-; CHECK-NEXT:       Lazy Branch Probability Analysis
-; CHECK-NEXT:       Lazy Block Frequency Analysis
-; CHECK-NEXT:       Expand memcmp() to load/stores
 ; CHECK-NEXT:       Lower Garbage Collection Instructions
 ; CHECK-NEXT:       Shadow Stack GC Lowering
 ; CHECK-NEXT:       Lower constant intrinsics
diff --git a/llvm/test/CodeGen/PowerPC/memCmpUsedInZeroEqualityComparison.ll b/llvm/test/CodeGen/PowerPC/memCmpUsedInZeroEqualityComparison.ll
deleted file mode 100644
index 1da40d46aa7730..00000000000000
--- a/llvm/test/CodeGen/PowerPC/memCmpUsedInZeroEqualityComparison.ll
+++ /dev/null
@@ -1,168 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -ppc-gpr-icmps=all -verify-machineinstrs -mcpu=pwr8 < %s | FileCheck %s
-target datalayout = "e-m:e-i64:64-n32:64"
-target triple = "powerpc64le-unknown-linux-gnu"
-
-@zeroEqualityTest01.buffer1 = private unnamed_addr constant [3 x i32] [i32 1, i32 2, i32 4], align 4
-@zeroEqualityTest01.buffer2 = private unnamed_addr constant [3 x i32] [i32 1, i32 2, i32 3], align 4
-@zeroEqualityTest02.buffer1 = private unnamed_addr constant [4 x i32] [i32 4, i32 0, i32 0, i32 0], align 4
-@zeroEqualityTest02.buffer2 = private unnamed_addr constant [4 x i32] [i32 3, i32 0, i32 0, i32 0], align 4
-@zeroEqualityTest03.buffer1 = private unnamed_addr constant [4 x i32] [i32 0, i32 0, i32 0, i32 3], align 4
-@zeroEqualityTest03.buffer2 = private unnamed_addr constant [4 x i32] [i32 0, i32 0, i32 0, i32 4], align 4
-@zeroEqualityTest04.buffer1 = private unnamed_addr constant [15 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14], align 4
-@zeroEqualityTest04.buffer2 = private unnamed_addr constant [15 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 13], align 4
-
-declare signext i32 @memcmp(ptr nocapture, ptr nocapture, i64) local_unnamed_addr #1
-
-; Check 4 bytes - requires 1 load for each param.
-define signext i32 @zeroEqualityTest02(ptr %x, ptr %y) {
-; CHECK-LABEL: zeroEqualityTest02:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    lwz 3, 0(3)
-; CHECK-NEXT:    lwz 4, 0(4)
-; CHECK-NEXT:    xor 3, 3, 4
-; CHECK-NEXT:    cntlzw 3, 3
-; CHECK-NEXT:    srwi 3, 3, 5
-; CHECK-NEXT:    xori 3, 3, 1
-; CHECK-NEXT:    blr
-  %call = tail call signext i32 @memcmp(ptr %x, ptr %y, i64 4)
-  %not.cmp = icmp ne i32 %call, 0
-  %. = zext i1 %not.cmp to i32
-  ret i32 %.
-}
-
-; Check 16 bytes - requires 2 loads for each param (or use vectors?).
-define signext i32 @zeroEqualityTest01(ptr %x, ptr %y) {
-; CHECK-LABEL: zeroEqualityTest01:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    ld 5, 0(3)
-; CHECK-NEXT:    ld 6, 0(4)
-; CHECK-NEXT:    cmpld 5, 6
-; CHECK-NEXT:    bne 0, .LBB1_2
-; CHECK-NEXT:  # %bb.1: # %loadbb1
-; CHECK-NEXT:    ld 5, 8(3)
-; CHECK-NEXT:    ld 4, 8(4)
-; CHECK-NEXT:    li 3, 0
-; CHECK-NEXT:    cmpld 5, 4
-; CHECK-NEXT:    beqlr 0
-; CHECK-NEXT:  .LBB1_2: # %res_block
-; CHECK-NEXT:    li 3, 1
-; CHECK-NEXT:    blr
-  %call = tail call signext i32 @memcmp(ptr %x, ptr %y, i64 16)
-  %not.tobool = icmp ne i32 %call, 0
-  %. = zext i1 %not.tobool to i32
-  ret i32 %.
-}
-
-; Check 7 bytes - requires 3 loads for each param.
-define signext i32 @zeroEqualityTest03(ptr %x, ptr %y) {
-; CHECK-LABEL: zeroEqualityTest03:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    lwz 5, 0(3)
-; CHECK-NEXT:    lwz 6, 0(4)
-; CHECK-NEXT:    cmplw 5, 6
-; CHECK-NEXT:    bne 0, .LBB2_3
-; CHECK-NEXT:  # %bb.1: # %loadbb1
-; CHECK-NEXT:    lhz 5, 4(3)
-; CHECK-NEXT:    lhz 6, 4(4)
-; CHECK-NEXT:    cmplw 5, 6
-; CHECK-NEXT:    bne 0, .LBB2_3
-; CHECK-NEXT:  # %bb.2: # %loadbb2
-; CHECK-NEXT:    lbz 5, 6(3)
-; CHECK-NEXT:    lbz 4, 6(4)
-; CHECK-NEXT:    li 3, 0
-; CHECK-NEXT:    cmplw 5, 4
-; CHECK-NEXT:    beqlr 0
-; CHECK-NEXT:  .LBB2_3: # %res_block
-; CHECK-NEXT:    li 3, 1
-; CHECK-NEXT:    blr
-  %call = tail call signext i32 @memcmp(ptr %x, ptr %y, i64 7)
-  %not.lnot = icmp ne i32 %call, 0
-  %cond = zext i1 %not.lnot to i32
-  ret i32 %cond
-}
-
-; Validate with > 0
-define signext i32 @zeroEqualityTest04() {
-; CHECK-LABEL: zeroEqualityTest04:
-; CHECK:       # %bb.0: # %loadbb
-; CHECK-NEXT:    li 3, 0
-; CHECK-NEXT:    blr
-  %call = tail call signext i32 @memcmp(ptr @zeroEqualityTest02.buffer1, ptr @zeroEqualityTest02.buffer2, i64 16)
-  %not.cmp = icmp slt i32 %call, 1
-  %. = zext i1 %not.cmp to i32
-  ret i32 %.
-}
-
-; Validate with < 0
-define signext i32 @zeroEqualityTest05() {
-; CHECK-LABEL: zeroEqualityTest05:
-; CHECK:       # %bb.0: # %loadbb
-; CHECK-NEXT:    li 3, 0
-; CHECK-NEXT:    blr
-  %call = tail call signext i32 @memcmp(ptr @zeroEqualityTest03.buffer1, ptr @zeroEqualityTest03.buffer2, i64 16)
-  %call.lobit = lshr i32 %call, 31
-  %call.lobit.not = xor i32 %call.lobit, 1
-  ret i32 %call.lobit.not
-}
-
-; Validate with memcmp()?:
-define signext i32 @equalityFoldTwoConstants() {
-; CHECK-LABEL: equalityFoldTwoConstants:
-; CHECK:       # %bb.0: # %loadbb
-; CHECK-NEXT:    li 3, 1
-; CHECK-NEXT:    blr
-  %call = tail call signext i32 @memcmp(ptr @zeroEqualityTest04.buffer1, ptr @zeroEqualityTest04.buffer2, i64 16)
-  %not.tobool = icmp eq i32 %call, 0
-  %cond = zext i1 %not.tobool to i32
-  ret i32 %cond
-}
-
-define signext i32 @equalityFoldOneConstant(ptr %X) {
-; CHECK-LABEL: equalityFoldOneConstant:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    li 5, 1
-; CHECK-NEXT:    ld 4, 0(3)
-; CHECK-NEXT:    rldic 5, 5, 32, 31
-; CHECK-NEXT:    cmpld 4, 5
-; CHECK-NEXT:    bne 0, .LBB6_2
-; CHECK-NEXT:  # %bb.1: # %loadbb1
-; CHECK-NEXT:    lis 5, -32768
-; CHECK-NEXT:    ld 4, 8(3)
-; CHECK-NEXT:    li 3, 0
-; CHECK-NEXT:    ori 5, 5, 1
-; CHECK-NEXT:    rldic 5, 5, 1, 30
-; CHECK-NEXT:    cmpld 4, 5
-; CHECK-NEXT:    beq 0, .LBB6_3
-; CHECK-NEXT:  .LBB6_2: # %res_block
-; CHECK-NEXT:    li 3, 1
-; CHECK-NEXT:  .LBB6_3: # %endblock
-; CHECK-NEXT:    cntlzw 3, 3
-; CHECK-NEXT:    srwi 3, 3, 5
-; CHECK-NEXT:    blr
-  %call = tail call signext i32 @memcmp(ptr @zeroEqualityTest04.buffer1, ptr %X, i64 16)
-  %not.tobool = icmp eq i32 %call, 0
-  %cond = zext i1 %not.tobool to i32
-  ret i32 %cond
-}
-
-define i1 @length2_eq_nobuiltin_attr(ptr %X, ptr %Y) nounwind {
-; CHECK-LABEL: length2_eq_nobuiltin_attr:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    mflr 0
-; CHECK-NEXT:    stdu 1, -32(1)
-; CHECK-NEXT:    li 5, 2
-; CHECK-NEXT:    std 0, 48(1)
-; CHECK-NEXT:    bl memcmp
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    cntlzw 3, 3
-; CHECK-NEXT:    rlwinm 3, 3, 27, 31, 31
-; CHECK-NEXT:    addi 1, 1, 32
-; CHECK-NEXT:    ld 0, 16(1)
-; CHECK-NEXT:    mtlr 0
-; CHECK-NEXT:    blr
-  %m = tail call signext i32 @memcmp(ptr %X, ptr %Y, i64 2) nobuiltin
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
diff --git a/llvm/test/CodeGen/PowerPC/memcmp-mergeexpand.ll b/llvm/test/CodeGen/PowerPC/memcmp-mergeexpand.ll
deleted file mode 100644
index 29910646c89371..00000000000000
--- a/llvm/test/CodeGen/PowerPC/memcmp-mergeexpand.ll
+++ /dev/null
@@ -1,39 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -verify-machineinstrs -mcpu=pwr8 -mtriple=powerpc64le-unknown-gnu-linux  < %s | FileCheck %s -check-prefix=PPC64LE
-
-; This tests interaction between MergeICmp and expand-memcmp.
-
-%"struct.std::pair" = type { i32, i32 }
-
-define zeroext i1 @opeq1(
-; PPC64LE-LABEL: opeq1:
-; PPC64LE:       # %bb.0: # %"entry+land.rhs.i"
-; PPC64LE-NEXT:    ld 3, 0(3)
-; PPC64LE-NEXT:    ld 4, 0(4)
-; PPC64LE-NEXT:    cmpd 3, 4
-; PPC64LE-NEXT:    li 3, 0
-; PPC64LE-NEXT:    li 4, 1
-; PPC64LE-NEXT:    iseleq 3, 4, 3
-; PPC64LE-NEXT:    blr
-  ptr nocapture readonly dereferenceable(8) %a,
-  ptr nocapture readonly dereferenceable(8) %b) local_unnamed_addr #0 {
-entry:
-  %0 = load i32, ptr %a, align 4
-  %1 = load i32, ptr %b, align 4
-  %cmp.i = icmp eq i32 %0, %1
-  br i1 %cmp.i, label %land.rhs.i, label %opeq1.exit
-
-land.rhs.i:
-  %second.i = getelementptr inbounds %"struct.std::pair", ptr %a, i64 0, i32 1
-  %2 = load i32, ptr %second.i, align 4
-  %second2.i = getelementptr inbounds %"struct.std::pair", ptr %b, i64 0, i32 1
-  %3 = load i32, ptr %second2.i, align 4
-  %cmp3.i = icmp eq i32 %2, %3
-  br label %opeq1.exit
-
-opeq1.exit:
-  %4 = phi i1 [ false, %entry ], [ %cmp3.i, %land.rhs.i ]
-  ret i1 %4
-}
-
-
diff --git a/llvm/test/CodeGen/PowerPC/memcmp.ll b/llvm/test/CodeGen/PowerPC/memcmp.ll
deleted file mode 100644
index 0634534b9c9df1..00000000000000
--- a/llvm/test/CodeGen/PowerPC/memcmp.ll
+++ /dev/null
@@ -1,62 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -verify-machineinstrs -mcpu=pwr8 -mtriple=powerpc64le-unknown-gnu-linux  < %s | FileCheck %s -check-prefix=CHECK
-
-define signext i32 @memcmp8(ptr nocapture readonly %buffer1, ptr nocapture readonly %buffer2) {
-; CHECK-LABEL: memcmp8:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    ldbrx 3, 0, 3
-; CHECK-NEXT:    ldbrx 4, 0, 4
-; CHECK-NEXT:    subc 5, 4, 3
-; CHECK-NEXT:    subfe 5, 4, 4
-; CHECK-NEXT:    subc 4, 3, 4
-; CHECK-NEXT:    subfe 3, 3, 3
-; CHECK-NEXT:    neg 5, 5
-; CHECK-NEXT:    neg 3, 3
-; CHECK-NEXT:    sub 3, 5, 3
-; CHECK-NEXT:    extsw 3, 3
-; CHECK-NEXT:    blr
-  %call = tail call signext i32 @memcmp(ptr %buffer1, ptr %buffer2, i64 8)
-  ret i32 %call
-}
-
-define signext i32 @memcmp4(ptr nocapture readonly %buffer1, ptr nocapture readonly %buffer2) {
-; CHECK-LABEL: memcmp4:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    lwbrx 3, 0, 3
-; CHECK-NEXT:    lwbrx 4, 0, 4
-; CHECK-NEXT:    sub 5, 4, 3
-; CHECK-NEXT:    sub 3, 3, 4
-; CHECK-NEXT:    rldicl 5, 5, 1, 63
-; CHECK-NEXT:    rldicl 3, 3, 1, 63
-; CHECK-NEXT:    sub 3, 5, 3
-; CHECK-NEXT:    extsw 3, 3
-; CHECK-NEXT:    blr
-  %call = tail call signext i32 @memcmp(ptr %buffer1, ptr %buffer2, i64 4)
-  ret i32 %call
-}
-
-define signext i32 @memcmp2(ptr nocapture readonly %buffer1, ptr nocapture readonly %buffer2) {
-; CHECK-LABEL: memcmp2:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    lhbrx 3, 0, 3
-; CHECK-NEXT:    lhbrx 4, 0, 4
-; CHECK-NEXT:    sub 3, 3, 4
-; CHECK-NEXT:    extsw 3, 3
-; CHECK-NEXT:    blr
-  %call = tail call signext i32 @memcmp(ptr %buffer1, ptr %buffer2, i64 2)
-  ret i32 %call
-}
-
-define signext i32 @memcmp1(ptr nocapture readonly %buffer1, ptr nocapture readonly %buffer2) {
-; CHECK-LABEL: memcmp1:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    lbz 3, 0(3)
-; CHECK-NEXT:    lbz 4, 0(4)
-; CHECK-NEXT:    sub 3, 3, 4
-; CHECK-NEXT:    extsw 3, 3
-; CHECK-NEXT:    blr
-  %call = tail call signext i32 @memcmp(ptr %buffer1, ptr %buffer2, i64 1) #2
-  ret i32 %call
-}
-
-declare signext i32 @memcmp(ptr, ptr, i64)
diff --git a/llvm/test/CodeGen/PowerPC/memcmpIR.ll b/llvm/test/CodeGen/PowerPC/memcmpIR.ll
deleted file mode 100644
index 0a8bec7dc0e3f1..00000000000000
--- a/llvm/test/CodeGen/PowerPC/memcmpIR.ll
+++ /dev/null
@@ -1,178 +0,0 @@
-; RUN: llc -o - -mtriple=powerpc64le-unknown-gnu-linux -stop-after codegenprepare %s | FileCheck %s
-; RUN: llc -o - -mtriple=powerpc64-unknown-gnu-linux -stop-after codegenprepare %s | FileCheck %s --check-prefix=CHECK-BE
-
-define signext i32 @test1(ptr nocapture readonly %buffer1, ptr nocapture readonly %buffer2)  {
-entry:
-  ; CHECK-LABEL: @test1(
-  ; CHECK-LABEL: res_block:{{.*}}
-  ; CHECK: [[ICMP2:%[0-9]+]] = icmp ult i64
-  ; CHECK-NEXT: [[SELECT:%[0-9]+]] = select i1 [[ICMP2]], i32 -1, i32 1
-  ; CHECK-NEXT: br label %endblock
-
-  ; CHECK-LABEL: loadbb:{{.*}}
-  ; CHECK: [[LOAD1:%[0-9]+]] = load i64, ptr
-  ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i64, ptr
-  ; CHECK-NEXT: [[BSWAP1:%[0-9]+]] = call i64 @llvm.bswap.i64(i64 [[LOAD1]])
-  ; CHECK-NEXT: [[BSWAP2:%[0-9]+]] = call i64 @llvm.bswap.i64(i64 [[LOAD2]])
-  ; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp eq i64 [[BSWAP1]], [[BSWAP2]]
-  ; CHECK-NEXT:  br i1 [[ICMP]], label %loadbb1, label %res_block
-
-  ; CHECK-LABEL: loadbb1:{{.*}}
-  ; CHECK-NEXT: [[GEP1:%[0-9]+]] = getelementptr i8, ptr {{.*}}, i64 8
-  ; CHECK-NEXT: [[GEP2:%[0-9]+]] = getelementptr i8, ptr {{.*}}, i64 8
-  ; CHECK-NEXT: [[LOAD1:%[0-9]+]] = load i64, ptr [[GEP1]]
-  ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i64, ptr [[GEP2]]
-  ; CHECK-NEXT: [[BSWAP1:%[0-9]+]] = call i64 @llvm.bswap.i64(i64 [[LOAD1]])
-  ; CHECK-NEXT: [[BSWAP2:%[0-9]+]] = call i64 @llvm.bswap.i64(i64 [[LOAD2]])
-  ; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp eq i64 [[BSWAP1]], [[BSWAP2]]
-  ; CHECK-NEXT:  br i1 [[ICMP]], label %endblock, label %res_block
-
-  ; CHECK-BE-LABEL: @test1(
-  ; CHECK-BE-LABEL: res_block:{{.*}}
-  ; CHECK-BE: [[ICMP2:%[0-9]+]] = icmp ult i64
-  ; CHECK-BE-NEXT: [[SELECT:%[0-9]+]] = select i1 [[ICMP2]], i32 -1, i32 1
-  ; CHECK-BE-NEXT: br label %endblock
-
-  ; CHECK-BE-LABEL: loadbb:{{.*}}
-  ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i64, ptr
-  ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i64, ptr
-  ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp eq i64 [[LOAD1]], [[LOAD2]]
-  ; CHECK-BE-NEXT:  br i1 [[ICMP]], label %loadbb1, label %res_block
-
-  ; CHECK-BE-LABEL: loadbb1:{{.*}}
-  ; CHECK-BE-NEXT: [[GEP1:%[0-9]+]] = getelementptr i8, ptr {{.*}}, i64 8
-  ; CHECK-BE-NEXT: [[GEP2:%[0-9]+]] = getelementptr i8, ptr {{.*}}, i64 8
-  ; CHECK-BE-NEXT: [[LOAD1:%[0-9]+]] = load i64, ptr [[GEP1]]
-  ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i64, ptr [[GEP2]]
-  ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp eq i64 [[LOAD1]], [[LOAD2]]
-  ; CHECK-BE-NEXT:  br i1 [[ICMP]], label %endblock, label %res_block
-
-  %call = tail call signext i32 @memcmp(ptr %buffer1, ptr %buffer2, i64 16)
-  ret i32 %call
-}
-
-declare signext i32 @memcmp(ptr nocapture, ptr nocapture, i64) local_unnamed_addr #1
-
-define signext i32 @test2(ptr nocapture readonly %buffer1, ptr nocapture readonly %buffer2)  {
-  ; CHECK-LABEL: @test2(
-  ; CHECK: [[LOAD1:%[0-9]+]] = load i32, ptr
-  ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i32, ptr
-  ; CHECK-NEXT: [[BSWAP1:%[0-9]+]] = call i32 @llvm.bswap.i32(i32 [[LOAD1]])
-  ; CHECK-NEXT: [[BSWAP2:%[0-9]+]] = call i32 @llvm.bswap.i32(i32 [[LOAD2]])
-  ; CHECK-NEXT: [[CMP1:%[0-9]+]] = icmp ugt i32 [[BSWAP1]], [[BSWAP2]]
-  ; CHECK-NEXT: [[CMP2:%[0-9]+]] = icmp ult i32 [[BSWAP1]], [[BSWAP2]]
-  ; CHECK-NEXT: [[Z1:%[0-9]+]] = zext i1 [[CMP1]] to i32
-  ; CHECK-NEXT: [[Z2:%[0-9]+]] = zext i1 [[CMP2]] to i32
-  ; CHECK-NEXT: [[SUB:%[0-9]+]] = sub i32 [[Z1]], [[Z2]]
-  ; CHECK-NEXT: ret i32 [[SUB]]
-
-  ; CHECK-BE-LABEL: @test2(
-  ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i32, ptr
-  ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i32, ptr
-  ; CHECK-BE-NEXT: [[CMP1:%[0-9]+]] = icmp ugt i32 [[LOAD1]], [[LOAD2]]
-  ; CHECK-BE-NEXT: [[CMP2:%[0-9]+]] = icmp ult i32 [[LOAD1]], [[LOAD2]]
-  ; CHECK-BE-NEXT: [[Z1:%[0-9]+]] = zext i1 [[CMP1]] to i32
-  ; CHECK-BE-NEXT: [[Z2:%[0-9]+]] = zext i1 [[CMP2]] to i32
-  ; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i32 [[Z1]], [[Z2]]
-  ; CHECK-BE-NEXT: ret i32 [[SUB]]
-
-entry:
-  %call = tail call signext i32 @memcmp(ptr %buffer1, ptr %buffer2, i64 4)
-  ret i32 %call
-}
-
-define signext i32 @test3(ptr nocapture readonly %buffer1, ptr nocapture readonly %buffer2)  {
-  ; CHECK-LABEL: res_block:{{.*}}
-  ; CHECK: [[ICMP2:%[0-9]+]] = icmp ult i64
-  ; CHECK-NEXT: [[SELECT:%[0-9]+]] = select i1 [[ICMP2]], i32 -1, i32 1
-  ; CHECK-NEXT: br label %endblock
-
-  ; CHECK-LABEL: loadbb:{{.*}}
-  ; CHECK: [[LOAD1:%[0-9]+]] = load i64, ptr
-  ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i64, ptr
-  ; CHECK-NEXT: [[BSWAP1:%[0-9]+]] = call i64 @llvm.bswap.i64(i64 [[LOAD1]])
-  ; CHECK-NEXT: [[BSWAP2:%[0-9]+]] = call i64 @llvm.bswap.i64(i64 [[LOAD2]])
-  ; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp eq i64 [[BSWAP1]], [[BSWAP2]]
-  ; CHECK-NEXT:  br i1 [[ICMP]], label %loadbb1, label %res_block
-
-  ; CHECK-LABEL: loadbb1:{{.*}}
-  ; CHECK: [[LOAD1:%[0-9]+]] = load i32, ptr
-  ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i32, ptr
-  ; CHECK-NEXT: [[BSWAP1:%[0-9]+]] = call i32 @llvm.bswap.i32(i32 [[LOAD1]])
-  ; CHECK-NEXT: [[BSWAP2:%[0-9]+]] = call i32 @llvm.bswap.i32(i32 [[LOAD2]])
-  ; CHECK-NEXT: [[ZEXT1:%[0-9]+]] = zext i32 [[BSWAP1]] to i64
-  ; CHECK-NEXT: [[ZEXT2:%[0-9]+]] = zext i32 [[BSWAP2]] to i64
-  ; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp eq i64 [[ZEXT1]], [[ZEXT2]]
-  ; CHECK-NEXT:  br i1 [[ICMP]], label %loadbb2, label %res_block
-
-  ; CHECK-LABEL: loadbb2:{{.*}}
-  ; CHECK: [[LOAD1:%[0-9]+]] = load i16, ptr
-  ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i16, ptr
-  ; CHECK-NEXT: [[BSWAP1:%[0-9]+]] = call i16 @llvm.bswap.i16(i16 [[LOAD1]])
-  ; CHECK-NEXT: [[BSWAP2:%[0-9]+]] = call i16 @llvm.bswap.i16(i16 [[LOAD2]])
-  ; CHECK-NEXT: [[ZEXT1:%[0-9]+]] = zext i16 [[BSWAP1]] to i64
-  ; CHECK-NEXT: [[ZEXT2:%[0-9]+]] = zext i16 [[BSWAP2]] to i64
-  ; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp eq i64 [[ZEXT1]], [[ZEXT2]]
-  ; CHECK-NEXT:  br i1 [[ICMP]], label %loadbb3, label %res_block
-
-  ; CHECK-LABEL: loadbb3:{{.*}}
-  ; CHECK: [[LOAD1:%[0-9]+]] = load i8, ptr
-  ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i8, ptr
-  ; CHECK-NEXT: [[ZEXT1:%[0-9]+]] = zext i8 [[LOAD1]] to i32
-  ; CHECK-NEXT: [[ZEXT2:%[0-9]+]] = zext i8 [[LOAD2]] to i32
-  ; CHECK-NEXT: [[SUB:%[0-9]+]] = sub i32 [[ZEXT1]], [[ZEXT2]]
-  ; CHECK-NEXT:  br label %endblock
-
-  ; CHECK-BE-LABEL: res_block:{{.*}}
-  ; CHECK-BE: [[ICMP2:%[0-9]+]] = icmp ult i64
-  ; CHECK-BE-NEXT: [[SELECT:%[0-9]+]] = select i1 [[ICMP2]], i32 -1, i32 1
-  ; CHECK-BE-NEXT: br label %endblock
-
-  ; CHECK-BE-LABEL: loadbb:{{.*}}
-  ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i64, ptr
-  ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i64, ptr
-  ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp eq i64 [[LOAD1]], [[LOAD2]]
-  ; CHECK-BE-NEXT:  br i1 [[ICMP]], label %loadbb1, label %res_block
-
-  ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i32, ptr
-  ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i32, ptr
-  ; CHECK-BE-NEXT: [[ZEXT1:%[0-9]+]] = zext i32 [[LOAD1]] to i64
-  ; CHECK-BE-NEXT: [[ZEXT2:%[0-9]+]] = zext i32 [[LOAD2]] to i64
-  ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp eq i64 [[ZEXT1]], [[ZEXT2]]
-  ; CHECK-BE-NEXT:  br i1 [[ICMP]], label %loadbb2, label %res_block
-
-  ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i16, ptr
-  ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i16, ptr
-  ; CHECK-BE-NEXT: [[ZEXT1:%[0-9]+]] = zext i16 [[LOAD1]] to i64
-  ; CHECK-BE-NEXT: [[ZEXT2:%[0-9]+]] = zext i16 [[LOAD2]] to i64
-  ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp eq i64 [[ZEXT1]], [[ZEXT2]]
-  ; CHECK-BE-NEXT:  br i1 [[ICMP]], label %loadbb3, label %res_block
-
-  ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i8, ptr
-  ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i8, ptr
-  ; CHECK-BE-NEXT: [[ZEXT1:%[0-9]+]] = zext i8 [[LOAD1]] to i32
-  ; CHECK-BE-NEXT: [[ZEXT2:%[0-9]+]] = zext i8 [[LOAD2]] to i32
-  ; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i32 [[ZEXT1]], [[ZEXT2]]
-  ; CHECK-BE-NEXT:  br label %endblock
-
-entry:
-  %call = tail call signext i32 @memcmp(ptr %buffer1, ptr %buffer2, i64 15)
-  ret i32 %call
-}
-  ; CHECK: call = tail call signext i32 @memcmp
-  ; CHECK-BE: call = tail call signext i32 @memcmp
-define signext i32 @test4(ptr nocapture readonly %buffer1, ptr nocapture readonly %buffer2)  {
-
-entry:
-  %call = tail call signext i32 @memcmp(ptr %buffer1, ptr %buffer2, i64 65)
-  ret i32 %call
-}
-
-define signext i32 @test5(ptr nocapture readonly %buffer1, ptr nocapture readonly %buffer2, i32 signext %SIZE)  {
-  ; CHECK: call = tail call signext i32 @memcmp
-  ; CHECK-BE: call = tail call signext i32 @memcmp
-entry:
-  %conv = sext i32 %SIZE to i64
-  %call = tail call signext i32 @memcmp(ptr %buffer1, ptr %buffer2, i64 %conv)
-  ret i32 %call
-}
diff --git a/llvm/test/CodeGen/RISCV/O3-pipeline.ll b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
index e7db8ef9d5aff3..8b07c7015dcceb 100644
--- a/llvm/test/CodeGen/RISCV/O3-pipeline.ll
+++ b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
@@ -45,13 +45,6 @@
 ; CHECK-NEXT:         Canonicalize Freeze Instructions in Loops
 ; CHECK-NEXT:         Induction Variable Users
 ; CHECK-NEXT:         Loop Strength Reduction
-; CHECK-NEXT:       Basic Alias Analysis (stateless AA impl)
-; CHECK-NEXT:       Function Alias Analysis Results
-; CHECK-NEXT:       Merge contiguous icmps into a memcmp
-; CHECK-NEXT:       Natural Loop Information
-; CHECK-NEXT:       Lazy Branch Probability Analysis
-; CHECK-NEXT:       Lazy Block Frequency Analysis
-; CHECK-NEXT:       Expand memcmp() to load/stores
 ; CHECK-NEXT:       Lower Garbage Collection Instructions
 ; CHECK-NEXT:       Shadow Stack GC Lowering
 ; CHECK-NEXT:       Lower constant intrinsics
@@ -193,7 +186,7 @@
 ; CHECK-NEXT:       Machine Optimization Remark Emitter
 ; CHECK-NEXT:       Stack Frame Layout Analysis
 ; CHECK-NEXT:       RISC-V Zcmp move merging pass
-; CHECK-NEXT:       RISC-V Zcmp Push/Pop optimization pass 
+; CHECK-NEXT:       RISC-V Zcmp Push/Pop optimization pass
 ; CHECK-NEXT:       RISC-V pseudo instruction expansion pass
 ; CHECK-NEXT:       RISC-V atomic pseudo instruction expansion pass
 ; CHECK-NEXT:       Unpack machine instruction bundles
diff --git a/llvm/test/CodeGen/X86/memcmp-mergeexpand.ll b/llvm/test/CodeGen/X86/memcmp-mergeexpand.ll
deleted file mode 100644
index c16e2adb7a0783..00000000000000
--- a/llvm/test/CodeGen/X86/memcmp-mergeexpand.ll
+++ /dev/null
@@ -1,49 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown-unknown               | FileCheck %s --check-prefix=X86
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown             | FileCheck %s --check-prefix=X64
-
-; This tests interaction between MergeICmp and ExpandMemCmp.
-
-%"struct.std::pair" = type { i32, i32 }
-
-define zeroext i1 @opeq1(
-; X86-LABEL: opeq1:
-; X86:       # %bb.0: # %"entry+land.rhs.i"
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %edx
-; X86-NEXT:    movl 4(%ecx), %ecx
-; X86-NEXT:    xorl (%eax), %edx
-; X86-NEXT:    xorl 4(%eax), %ecx
-; X86-NEXT:    orl %edx, %ecx
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-;
-; X64-LABEL: opeq1:
-; X64:       # %bb.0: # %"entry+land.rhs.i"
-; X64-NEXT:    movq (%rdi), %rax
-; X64-NEXT:    cmpq (%rsi), %rax
-; X64-NEXT:    sete %al
-; X64-NEXT:    retq
-  ptr nocapture readonly dereferenceable(8) %a,
-  ptr nocapture readonly dereferenceable(8) %b) local_unnamed_addr nofree nosync {
-entry:
-  %0 = load i32, ptr %a, align 4
-  %1 = load i32, ptr %b, align 4
-  %cmp.i = icmp eq i32 %0, %1
-  br i1 %cmp.i, label %land.rhs.i, label %opeq1.exit
-
-land.rhs.i:
-  %second.i = getelementptr inbounds %"struct.std::pair", ptr %a, i64 0, i32 1
-  %2 = load i32, ptr %second.i, align 4
-  %second2.i = getelementptr inbounds %"struct.std::pair", ptr %b, i64 0, i32 1
-  %3 = load i32, ptr %second2.i, align 4
-  %cmp3.i = icmp eq i32 %2, %3
-  br label %opeq1.exit
-
-opeq1.exit:
-  %4 = phi i1 [ false, %entry ], [ %cmp3.i, %land.rhs.i ]
-  ret i1 %4
-}
-
-
diff --git a/llvm/test/CodeGen/X86/memcmp-minsize-x32.ll b/llvm/test/CodeGen/X86/memcmp-minsize-x32.ll
deleted file mode 100644
index ae1320f8b0868b..00000000000000
--- a/llvm/test/CodeGen/X86/memcmp-minsize-x32.ll
+++ /dev/null
@@ -1,445 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=cmov | FileCheck %s --check-prefix=X86 --check-prefix=X86-NOSSE
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE2
-
-; This tests codegen time inlining/optimization of memcmp
-; rdar://6480398
-
-@.str = private constant [65 x i8] c"0123456789012345678901234567890123456789012345678901234567890123\00", align 1
-
-declare dso_local i32 @memcmp(ptr, ptr, i32)
-
-define i32 @length2(ptr %X, ptr %Y) nounwind minsize {
-; X86-LABEL: length2:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $2
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind
-  ret i32 %m
-}
-
-define i1 @length2_eq(ptr %X, ptr %Y) nounwind minsize {
-; X86-LABEL: length2_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzwl (%ecx), %ecx
-; X86-NEXT:    cmpw (%eax), %cx
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length2_eq_const(ptr %X) nounwind minsize {
-; X86-LABEL: length2_eq_const:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpw $12849, (%eax) # imm = 0x3231
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([65 x i8], ptr @.str, i32 0, i32 1), i32 2) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length2_eq_nobuiltin_attr(ptr %X, ptr %Y) nounwind minsize {
-; X86-LABEL: length2_eq_nobuiltin_attr:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $2
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind nobuiltin
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length3(ptr %X, ptr %Y) nounwind minsize {
-; X86-LABEL: length3:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $3
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 3) nounwind
-  ret i32 %m
-}
-
-define i1 @length3_eq(ptr %X, ptr %Y) nounwind minsize {
-; X86-LABEL: length3_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $3
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 3) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length4(ptr %X, ptr %Y) nounwind minsize {
-; X86-LABEL: length4:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $4
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 4) nounwind
-  ret i32 %m
-}
-
-define i1 @length4_eq(ptr %X, ptr %Y) nounwind minsize {
-; X86-LABEL: length4_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %ecx
-; X86-NEXT:    cmpl (%eax), %ecx
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 4) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length4_eq_const(ptr %X) nounwind minsize {
-; X86-LABEL: length4_eq_const:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl $875770417, (%eax) # imm = 0x34333231
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([65 x i8], ptr @.str, i32 0, i32 1), i32 4) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length5(ptr %X, ptr %Y) nounwind minsize {
-; X86-LABEL: length5:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $5
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 5) nounwind
-  ret i32 %m
-}
-
-define i1 @length5_eq(ptr %X, ptr %Y) nounwind minsize {
-; X86-LABEL: length5_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $5
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 5) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length8(ptr %X, ptr %Y) nounwind minsize {
-; X86-LABEL: length8:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $8
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 8) nounwind
-  ret i32 %m
-}
-
-define i1 @length8_eq(ptr %X, ptr %Y) nounwind minsize {
-; X86-LABEL: length8_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $8
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 8) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length8_eq_const(ptr %X) nounwind minsize {
-; X86-LABEL: length8_eq_const:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $8
-; X86-NEXT:    pushl $.L.str
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 8) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length12_eq(ptr %X, ptr %Y) nounwind minsize {
-; X86-LABEL: length12_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $12
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 12) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length12(ptr %X, ptr %Y) nounwind minsize {
-; X86-LABEL: length12:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $12
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 12) nounwind
-  ret i32 %m
-}
-
-; PR33329 - https://bugs.llvm.org/show_bug.cgi?id=33329
-
-define i32 @length16(ptr %X, ptr %Y) nounwind minsize {
-; X86-LABEL: length16:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $16
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 16) nounwind
-  ret i32 %m
-}
-
-define i1 @length16_eq(ptr %x, ptr %y) nounwind minsize {
-; X86-NOSSE-LABEL: length16_eq:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl $16
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    calll memcmp
-; X86-NOSSE-NEXT:    addl $12, %esp
-; X86-NOSSE-NEXT:    testl %eax, %eax
-; X86-NOSSE-NEXT:    setne %al
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE2-LABEL: length16_eq:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm1
-; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm1
-; X86-SSE2-NEXT:    pmovmskb %xmm1, %eax
-; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X86-SSE2-NEXT:    setne %al
-; X86-SSE2-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 16) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length16_eq_const(ptr %X) nounwind minsize {
-; X86-NOSSE-LABEL: length16_eq_const:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl $16
-; X86-NOSSE-NEXT:    pushl $.L.str
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    calll memcmp
-; X86-NOSSE-NEXT:    addl $12, %esp
-; X86-NOSSE-NEXT:    testl %eax, %eax
-; X86-NOSSE-NEXT:    sete %al
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE2-LABEL: length16_eq_const:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm0
-; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X86-SSE2-NEXT:    sete %al
-; X86-SSE2-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 16) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-; PR33914 - https://bugs.llvm.org/show_bug.cgi?id=33914
-
-define i32 @length24(ptr %X, ptr %Y) nounwind minsize {
-; X86-LABEL: length24:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $24
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 24) nounwind
-  ret i32 %m
-}
-
-define i1 @length24_eq(ptr %x, ptr %y) nounwind minsize {
-; X86-LABEL: length24_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $24
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 24) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length24_eq_const(ptr %X) nounwind minsize {
-; X86-LABEL: length24_eq_const:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $24
-; X86-NEXT:    pushl $.L.str
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 24) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length32(ptr %X, ptr %Y) nounwind minsize {
-; X86-LABEL: length32:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $32
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 32) nounwind
-  ret i32 %m
-}
-
-; PR33325 - https://bugs.llvm.org/show_bug.cgi?id=33325
-
-define i1 @length32_eq(ptr %x, ptr %y) nounwind minsize {
-; X86-LABEL: length32_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $32
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 32) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length32_eq_const(ptr %X) nounwind minsize {
-; X86-LABEL: length32_eq_const:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $32
-; X86-NEXT:    pushl $.L.str
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 32) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length64(ptr %X, ptr %Y) nounwind minsize {
-; X86-LABEL: length64:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $64
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 64) nounwind
-  ret i32 %m
-}
-
-define i1 @length64_eq(ptr %x, ptr %y) nounwind minsize {
-; X86-LABEL: length64_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $64
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 64) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length64_eq_const(ptr %X) nounwind minsize {
-; X86-LABEL: length64_eq_const:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $64
-; X86-NEXT:    pushl $.L.str
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 64) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
diff --git a/llvm/test/CodeGen/X86/memcmp-minsize.ll b/llvm/test/CodeGen/X86/memcmp-minsize.ll
deleted file mode 100644
index 544d1c49f26b99..00000000000000
--- a/llvm/test/CodeGen/X86/memcmp-minsize.ll
+++ /dev/null
@@ -1,433 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2
-
-; This tests codegen time inlining/optimization of memcmp
-; rdar://6480398
-
-@.str = private constant [65 x i8] c"0123456789012345678901234567890123456789012345678901234567890123\00", align 1
-
-declare dso_local i32 @memcmp(ptr, ptr, i64)
-
-define i32 @length2(ptr %X, ptr %Y) nounwind minsize {
-; X64-LABEL: length2:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq $2
-; X64-NEXT:    popq %rdx
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
-  ret i32 %m
-}
-
-define i1 @length2_eq(ptr %X, ptr %Y) nounwind minsize {
-; X64-LABEL: length2_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movzwl (%rdi), %eax
-; X64-NEXT:    cmpw (%rsi), %ax
-; X64-NEXT:    sete %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length2_eq_const(ptr %X) nounwind minsize {
-; X64-LABEL: length2_eq_const:
-; X64:       # %bb.0:
-; X64-NEXT:    cmpw $12849, (%rdi) # imm = 0x3231
-; X64-NEXT:    setne %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([65 x i8], ptr @.str, i32 0, i32 1), i64 2) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length2_eq_nobuiltin_attr(ptr %X, ptr %Y) nounwind minsize {
-; X64-LABEL: length2_eq_nobuiltin_attr:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    pushq $2
-; X64-NEXT:    popq %rdx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    sete %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind nobuiltin
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length3(ptr %X, ptr %Y) nounwind minsize {
-; X64-LABEL: length3:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq $3
-; X64-NEXT:    popq %rdx
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 3) nounwind
-  ret i32 %m
-}
-
-define i1 @length3_eq(ptr %X, ptr %Y) nounwind minsize {
-; X64-LABEL: length3_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    pushq $3
-; X64-NEXT:    popq %rdx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setne %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 3) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length4(ptr %X, ptr %Y) nounwind minsize {
-; X64-LABEL: length4:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq $4
-; X64-NEXT:    popq %rdx
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
-  ret i32 %m
-}
-
-define i1 @length4_eq(ptr %X, ptr %Y) nounwind minsize {
-; X64-LABEL: length4_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    cmpl (%rsi), %eax
-; X64-NEXT:    setne %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length4_eq_const(ptr %X) nounwind minsize {
-; X64-LABEL: length4_eq_const:
-; X64:       # %bb.0:
-; X64-NEXT:    cmpl $875770417, (%rdi) # imm = 0x34333231
-; X64-NEXT:    sete %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([65 x i8], ptr @.str, i32 0, i32 1), i64 4) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length5(ptr %X, ptr %Y) nounwind minsize {
-; X64-LABEL: length5:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq $5
-; X64-NEXT:    popq %rdx
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) nounwind
-  ret i32 %m
-}
-
-define i1 @length5_eq(ptr %X, ptr %Y) nounwind minsize {
-; X64-LABEL: length5_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    pushq $5
-; X64-NEXT:    popq %rdx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setne %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length8(ptr %X, ptr %Y) nounwind minsize {
-; X64-LABEL: length8:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq $8
-; X64-NEXT:    popq %rdx
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 8) nounwind
-  ret i32 %m
-}
-
-define i1 @length8_eq(ptr %X, ptr %Y) nounwind minsize {
-; X64-LABEL: length8_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rax
-; X64-NEXT:    cmpq (%rsi), %rax
-; X64-NEXT:    sete %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 8) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length8_eq_const(ptr %X) nounwind minsize {
-; X64-LABEL: length8_eq_const:
-; X64:       # %bb.0:
-; X64-NEXT:    movabsq $3978425819141910832, %rax # imm = 0x3736353433323130
-; X64-NEXT:    cmpq %rax, (%rdi)
-; X64-NEXT:    setne %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 8) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length12_eq(ptr %X, ptr %Y) nounwind minsize {
-; X64-LABEL: length12_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    pushq $12
-; X64-NEXT:    popq %rdx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setne %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 12) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length12(ptr %X, ptr %Y) nounwind minsize {
-; X64-LABEL: length12:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq $12
-; X64-NEXT:    popq %rdx
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 12) nounwind
-  ret i32 %m
-}
-
-; PR33329 - https://bugs.llvm.org/show_bug.cgi?id=33329
-
-define i32 @length16(ptr %X, ptr %Y) nounwind minsize {
-;
-; X64-LABEL: length16:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq $16
-; X64-NEXT:    popq %rdx
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 16) nounwind
-  ret i32 %m
-}
-
-define i1 @length16_eq(ptr %x, ptr %y) nounwind minsize {
-; X64-SSE2-LABEL: length16_eq:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movdqu (%rsi), %xmm0
-; X64-SSE2-NEXT:    movdqu (%rdi), %xmm1
-; X64-SSE2-NEXT:    pcmpeqb %xmm0, %xmm1
-; X64-SSE2-NEXT:    pmovmskb %xmm1, %eax
-; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT:    setne %al
-; X64-SSE2-NEXT:    retq
-;
-; X64-AVX-LABEL: length16_eq:
-; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-AVX-NEXT:    vpxor (%rsi), %xmm0, %xmm0
-; X64-AVX-NEXT:    vptest %xmm0, %xmm0
-; X64-AVX-NEXT:    setne %al
-; X64-AVX-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 16) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length16_eq_const(ptr %X) nounwind minsize {
-; X64-SSE2-LABEL: length16_eq_const:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT:    sete %al
-; X64-SSE2-NEXT:    retq
-;
-; X64-AVX-LABEL: length16_eq_const:
-; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-AVX-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT:    vptest %xmm0, %xmm0
-; X64-AVX-NEXT:    sete %al
-; X64-AVX-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 16) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-; PR33914 - https://bugs.llvm.org/show_bug.cgi?id=33914
-
-define i32 @length24(ptr %X, ptr %Y) nounwind minsize {
-; X64-LABEL: length24:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq $24
-; X64-NEXT:    popq %rdx
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 24) nounwind
-  ret i32 %m
-}
-
-define i1 @length24_eq(ptr %x, ptr %y) nounwind minsize {
-; X64-LABEL: length24_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    pushq $24
-; X64-NEXT:    popq %rdx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    sete %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 24) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length24_eq_const(ptr %X) nounwind minsize {
-; X64-LABEL: length24_eq_const:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    pushq $24
-; X64-NEXT:    popq %rdx
-; X64-NEXT:    movl $.L.str, %esi
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setne %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 24) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length32(ptr %X, ptr %Y) nounwind minsize {
-; X64-LABEL: length32:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq $32
-; X64-NEXT:    popq %rdx
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 32) nounwind
-  ret i32 %m
-}
-
-; PR33325 - https://bugs.llvm.org/show_bug.cgi?id=33325
-
-define i1 @length32_eq(ptr %x, ptr %y) nounwind minsize {
-; X64-SSE2-LABEL: length32_eq:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    pushq %rax
-; X64-SSE2-NEXT:    pushq $32
-; X64-SSE2-NEXT:    popq %rdx
-; X64-SSE2-NEXT:    callq memcmp
-; X64-SSE2-NEXT:    testl %eax, %eax
-; X64-SSE2-NEXT:    sete %al
-; X64-SSE2-NEXT:    popq %rcx
-; X64-SSE2-NEXT:    retq
-;
-; X64-AVX1-LABEL: length32_eq:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    vmovups (%rdi), %ymm0
-; X64-AVX1-NEXT:    vxorps (%rsi), %ymm0, %ymm0
-; X64-AVX1-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX1-NEXT:    sete %al
-; X64-AVX1-NEXT:    vzeroupper
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length32_eq:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT:    vpxor (%rsi), %ymm0, %ymm0
-; X64-AVX2-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX2-NEXT:    sete %al
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 32) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length32_eq_const(ptr %X) nounwind minsize {
-; X64-SSE2-LABEL: length32_eq_const:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    pushq %rax
-; X64-SSE2-NEXT:    pushq $32
-; X64-SSE2-NEXT:    popq %rdx
-; X64-SSE2-NEXT:    movl $.L.str, %esi
-; X64-SSE2-NEXT:    callq memcmp
-; X64-SSE2-NEXT:    testl %eax, %eax
-; X64-SSE2-NEXT:    setne %al
-; X64-SSE2-NEXT:    popq %rcx
-; X64-SSE2-NEXT:    retq
-;
-; X64-AVX1-LABEL: length32_eq_const:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    vmovups (%rdi), %ymm0
-; X64-AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX1-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX1-NEXT:    setne %al
-; X64-AVX1-NEXT:    vzeroupper
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length32_eq_const:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX2-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX2-NEXT:    setne %al
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 32) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length64(ptr %X, ptr %Y) nounwind minsize {
-; X64-LABEL: length64:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq $64
-; X64-NEXT:    popq %rdx
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 64) nounwind
-  ret i32 %m
-}
-
-define i1 @length64_eq(ptr %x, ptr %y) nounwind minsize {
-; X64-LABEL: length64_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    pushq $64
-; X64-NEXT:    popq %rdx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setne %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 64) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length64_eq_const(ptr %X) nounwind minsize {
-; X64-LABEL: length64_eq_const:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    pushq $64
-; X64-NEXT:    popq %rdx
-; X64-NEXT:    movl $.L.str, %esi
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    sete %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 64) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
diff --git a/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll b/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll
deleted file mode 100644
index 0253d131226083..00000000000000
--- a/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll
+++ /dev/null
@@ -1,2911 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; NOTE: This is a copy of llvm/test/CodeGen/X86/memcmp.ll with more load pairs. Please keep it that way.
-; RUN: llc -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 < %s -mtriple=i686-unknown-unknown -mattr=cmov     | FileCheck %s --check-prefixes=X86,X86-NOSSE
-; RUN: llc -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 < %s -mtriple=i686-unknown-unknown -mattr=+sse     | FileCheck %s --check-prefixes=X86,X86-SSE1
-; RUN: llc -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 < %s -mtriple=i686-unknown-unknown -mattr=+sse2    | FileCheck %s --check-prefixes=X86,X86-SSE2
-; RUN: llc -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1  | FileCheck %s --check-prefixes=X86,X86-SSE41
-
-; This tests codegen time inlining/optimization of memcmp
-; rdar://6480398
-
- at .str = private constant [513 x i8] c"01234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901\00", align 1
-
-declare dso_local i32 @memcmp(ptr, ptr, i32)
-
-define i32 @length0(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length0:
-; X86:       # %bb.0:
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    retl
-   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 0) nounwind
-   ret i32 %m
- }
-
-define i1 @length0_eq(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length0_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    movb $1, %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 0) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length0_lt(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length0_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 0) nounwind
-  %c = icmp slt i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length2(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length2:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzwl (%ecx), %ecx
-; X86-NEXT:    movzwl (%eax), %edx
-; X86-NEXT:    rolw $8, %cx
-; X86-NEXT:    rolw $8, %dx
-; X86-NEXT:    movzwl %cx, %eax
-; X86-NEXT:    movzwl %dx, %ecx
-; X86-NEXT:    subl %ecx, %eax
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind
-  ret i32 %m
-}
-
-define i1 @length2_eq(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length2_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzwl (%ecx), %ecx
-; X86-NEXT:    cmpw (%eax), %cx
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length2_lt(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length2_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzwl (%ecx), %ecx
-; X86-NEXT:    movzwl (%eax), %edx
-; X86-NEXT:    rolw $8, %cx
-; X86-NEXT:    rolw $8, %dx
-; X86-NEXT:    movzwl %cx, %eax
-; X86-NEXT:    movzwl %dx, %ecx
-; X86-NEXT:    subl %ecx, %eax
-; X86-NEXT:    shrl $31, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind
-  %c = icmp slt i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length2_gt(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length2_gt:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzwl (%ecx), %ecx
-; X86-NEXT:    movzwl (%eax), %eax
-; X86-NEXT:    rolw $8, %cx
-; X86-NEXT:    rolw $8, %ax
-; X86-NEXT:    movzwl %cx, %ecx
-; X86-NEXT:    movzwl %ax, %eax
-; X86-NEXT:    subl %eax, %ecx
-; X86-NEXT:    testl %ecx, %ecx
-; X86-NEXT:    setg %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind
-  %c = icmp sgt i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length2_eq_const(ptr %X) nounwind {
-; X86-LABEL: length2_eq_const:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movzwl (%eax), %eax
-; X86-NEXT:    cmpl $12849, %eax # imm = 0x3231
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i32 2) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length2_eq_nobuiltin_attr(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length2_eq_nobuiltin_attr:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $2
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind nobuiltin
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length3(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length3:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movzwl (%eax), %edx
-; X86-NEXT:    movzwl (%ecx), %esi
-; X86-NEXT:    rolw $8, %dx
-; X86-NEXT:    rolw $8, %si
-; X86-NEXT:    cmpw %si, %dx
-; X86-NEXT:    jne .LBB9_3
-; X86-NEXT:  # %bb.1: # %loadbb1
-; X86-NEXT:    movzbl 2(%eax), %eax
-; X86-NEXT:    movzbl 2(%ecx), %ecx
-; X86-NEXT:    subl %ecx, %eax
-; X86-NEXT:    popl %esi
-; X86-NEXT:    retl
-; X86-NEXT:  .LBB9_3: # %res_block
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpw %si, %dx
-; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    orl $1, %eax
-; X86-NEXT:    popl %esi
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 3) nounwind
-  ret i32 %m
-}
-
-define i1 @length3_eq(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length3_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzwl (%ecx), %edx
-; X86-NEXT:    xorw (%eax), %dx
-; X86-NEXT:    movzbl 2(%ecx), %ecx
-; X86-NEXT:    xorb 2(%eax), %cl
-; X86-NEXT:    movzbl %cl, %eax
-; X86-NEXT:    orw %dx, %ax
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 3) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length4(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length4:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %ecx
-; X86-NEXT:    movl (%eax), %edx
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    seta %al
-; X86-NEXT:    sbbl $0, %eax
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 4) nounwind
-  ret i32 %m
-}
-
-define i1 @length4_eq(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length4_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %ecx
-; X86-NEXT:    cmpl (%eax), %ecx
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 4) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length4_lt(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length4_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %ecx
-; X86-NEXT:    movl (%eax), %eax
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %eax
-; X86-NEXT:    cmpl %eax, %ecx
-; X86-NEXT:    setb %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 4) nounwind
-  %c = icmp slt i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length4_gt(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length4_gt:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %ecx
-; X86-NEXT:    movl (%eax), %eax
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %eax
-; X86-NEXT:    cmpl %eax, %ecx
-; X86-NEXT:    seta %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 4) nounwind
-  %c = icmp sgt i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length4_eq_const(ptr %X) nounwind {
-; X86-LABEL: length4_eq_const:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl $875770417, (%eax) # imm = 0x34333231
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i32 4) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length5(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length5:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl (%eax), %edx
-; X86-NEXT:    movl (%ecx), %esi
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    bswapl %esi
-; X86-NEXT:    cmpl %esi, %edx
-; X86-NEXT:    jne .LBB16_3
-; X86-NEXT:  # %bb.1: # %loadbb1
-; X86-NEXT:    movzbl 4(%eax), %eax
-; X86-NEXT:    movzbl 4(%ecx), %ecx
-; X86-NEXT:    subl %ecx, %eax
-; X86-NEXT:    popl %esi
-; X86-NEXT:    retl
-; X86-NEXT:  .LBB16_3: # %res_block
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %esi, %edx
-; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    orl $1, %eax
-; X86-NEXT:    popl %esi
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 5) nounwind
-  ret i32 %m
-}
-
-define i1 @length5_eq(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length5_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %edx
-; X86-NEXT:    xorl (%eax), %edx
-; X86-NEXT:    movzbl 4(%ecx), %ecx
-; X86-NEXT:    xorb 4(%eax), %cl
-; X86-NEXT:    movzbl %cl, %eax
-; X86-NEXT:    orl %edx, %eax
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 5) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length5_lt(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length5_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl (%eax), %edx
-; X86-NEXT:    movl (%ecx), %esi
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    bswapl %esi
-; X86-NEXT:    cmpl %esi, %edx
-; X86-NEXT:    jne .LBB18_3
-; X86-NEXT:  # %bb.1: # %loadbb1
-; X86-NEXT:    movzbl 4(%eax), %eax
-; X86-NEXT:    movzbl 4(%ecx), %ecx
-; X86-NEXT:    subl %ecx, %eax
-; X86-NEXT:    jmp .LBB18_2
-; X86-NEXT:  .LBB18_3: # %res_block
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %esi, %edx
-; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    orl $1, %eax
-; X86-NEXT:  .LBB18_2: # %endblock
-; X86-NEXT:    shrl $31, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    popl %esi
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 5) nounwind
-  %c = icmp slt i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length7(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length7:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl (%esi), %ecx
-; X86-NEXT:    movl (%eax), %edx
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    jne .LBB19_2
-; X86-NEXT:  # %bb.1: # %loadbb1
-; X86-NEXT:    movl 3(%esi), %ecx
-; X86-NEXT:    movl 3(%eax), %edx
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    je .LBB19_3
-; X86-NEXT:  .LBB19_2: # %res_block
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    orl $1, %eax
-; X86-NEXT:  .LBB19_3: # %endblock
-; X86-NEXT:    popl %esi
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 7) nounwind
-  ret i32 %m
-}
-
-define i1 @length7_eq(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length7_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %edx
-; X86-NEXT:    movl 3(%ecx), %ecx
-; X86-NEXT:    xorl (%eax), %edx
-; X86-NEXT:    xorl 3(%eax), %ecx
-; X86-NEXT:    orl %edx, %ecx
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 7) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length7_lt(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length7_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl (%esi), %ecx
-; X86-NEXT:    movl (%eax), %edx
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    jne .LBB21_2
-; X86-NEXT:  # %bb.1: # %loadbb1
-; X86-NEXT:    movl 3(%esi), %ecx
-; X86-NEXT:    movl 3(%eax), %edx
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    je .LBB21_3
-; X86-NEXT:  .LBB21_2: # %res_block
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    orl $1, %eax
-; X86-NEXT:  .LBB21_3: # %endblock
-; X86-NEXT:    shrl $31, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    popl %esi
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 7) nounwind
-  %c = icmp slt i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length8(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length8:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl (%esi), %ecx
-; X86-NEXT:    movl (%eax), %edx
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    jne .LBB22_2
-; X86-NEXT:  # %bb.1: # %loadbb1
-; X86-NEXT:    movl 4(%esi), %ecx
-; X86-NEXT:    movl 4(%eax), %edx
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    je .LBB22_3
-; X86-NEXT:  .LBB22_2: # %res_block
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    orl $1, %eax
-; X86-NEXT:  .LBB22_3: # %endblock
-; X86-NEXT:    popl %esi
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 8) nounwind
-  ret i32 %m
-}
-
-define i1 @length8_eq(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length8_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %edx
-; X86-NEXT:    movl 4(%ecx), %ecx
-; X86-NEXT:    xorl (%eax), %edx
-; X86-NEXT:    xorl 4(%eax), %ecx
-; X86-NEXT:    orl %edx, %ecx
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 8) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length8_eq_const(ptr %X) nounwind {
-; X86-LABEL: length8_eq_const:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl $858927408, %ecx # imm = 0x33323130
-; X86-NEXT:    xorl (%eax), %ecx
-; X86-NEXT:    movl $926299444, %edx # imm = 0x37363534
-; X86-NEXT:    xorl 4(%eax), %edx
-; X86-NEXT:    orl %ecx, %edx
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 8) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length9_eq(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length9_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %edx
-; X86-NEXT:    movl 4(%ecx), %esi
-; X86-NEXT:    xorl (%eax), %edx
-; X86-NEXT:    xorl 4(%eax), %esi
-; X86-NEXT:    orl %edx, %esi
-; X86-NEXT:    movzbl 8(%ecx), %ecx
-; X86-NEXT:    xorb 8(%eax), %cl
-; X86-NEXT:    movzbl %cl, %eax
-; X86-NEXT:    orl %esi, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    popl %esi
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 9) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length10_eq(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length10_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %edx
-; X86-NEXT:    movl 4(%ecx), %esi
-; X86-NEXT:    xorl (%eax), %edx
-; X86-NEXT:    xorl 4(%eax), %esi
-; X86-NEXT:    orl %edx, %esi
-; X86-NEXT:    movzwl 8(%ecx), %ecx
-; X86-NEXT:    xorw 8(%eax), %cx
-; X86-NEXT:    movzwl %cx, %eax
-; X86-NEXT:    orl %esi, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    popl %esi
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 10) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length11_eq(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length11_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %edx
-; X86-NEXT:    movl 4(%ecx), %esi
-; X86-NEXT:    xorl (%eax), %edx
-; X86-NEXT:    xorl 4(%eax), %esi
-; X86-NEXT:    orl %edx, %esi
-; X86-NEXT:    movl 7(%ecx), %ecx
-; X86-NEXT:    xorl 7(%eax), %ecx
-; X86-NEXT:    orl %esi, %ecx
-; X86-NEXT:    sete %al
-; X86-NEXT:    popl %esi
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 11) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length12_eq(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length12_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %edx
-; X86-NEXT:    movl 4(%ecx), %esi
-; X86-NEXT:    xorl (%eax), %edx
-; X86-NEXT:    xorl 4(%eax), %esi
-; X86-NEXT:    orl %edx, %esi
-; X86-NEXT:    movl 8(%ecx), %ecx
-; X86-NEXT:    xorl 8(%eax), %ecx
-; X86-NEXT:    orl %esi, %ecx
-; X86-NEXT:    setne %al
-; X86-NEXT:    popl %esi
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 12) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length12(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length12:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl (%esi), %ecx
-; X86-NEXT:    movl (%eax), %edx
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    jne .LBB29_3
-; X86-NEXT:  # %bb.1: # %loadbb1
-; X86-NEXT:    movl 4(%esi), %ecx
-; X86-NEXT:    movl 4(%eax), %edx
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    jne .LBB29_3
-; X86-NEXT:  # %bb.2: # %loadbb2
-; X86-NEXT:    movl 8(%esi), %ecx
-; X86-NEXT:    movl 8(%eax), %edx
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    je .LBB29_4
-; X86-NEXT:  .LBB29_3: # %res_block
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    orl $1, %eax
-; X86-NEXT:  .LBB29_4: # %endblock
-; X86-NEXT:    popl %esi
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 12) nounwind
-  ret i32 %m
-}
-
-define i1 @length13_eq(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length13_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl (%edx), %esi
-; X86-NEXT:    movl 4(%edx), %eax
-; X86-NEXT:    xorl (%ecx), %esi
-; X86-NEXT:    xorl 4(%ecx), %eax
-; X86-NEXT:    orl %esi, %eax
-; X86-NEXT:    movl 8(%edx), %esi
-; X86-NEXT:    xorl 8(%ecx), %esi
-; X86-NEXT:    movzbl 12(%edx), %edx
-; X86-NEXT:    xorb 12(%ecx), %dl
-; X86-NEXT:    movzbl %dl, %ecx
-; X86-NEXT:    orl %esi, %ecx
-; X86-NEXT:    orl %eax, %ecx
-; X86-NEXT:    sete %al
-; X86-NEXT:    popl %esi
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 13) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length14_eq(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length14_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl (%edx), %esi
-; X86-NEXT:    movl 4(%edx), %eax
-; X86-NEXT:    xorl (%ecx), %esi
-; X86-NEXT:    xorl 4(%ecx), %eax
-; X86-NEXT:    orl %esi, %eax
-; X86-NEXT:    movl 8(%edx), %esi
-; X86-NEXT:    xorl 8(%ecx), %esi
-; X86-NEXT:    movzwl 12(%edx), %edx
-; X86-NEXT:    xorw 12(%ecx), %dx
-; X86-NEXT:    movzwl %dx, %ecx
-; X86-NEXT:    orl %esi, %ecx
-; X86-NEXT:    orl %eax, %ecx
-; X86-NEXT:    sete %al
-; X86-NEXT:    popl %esi
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 14) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length15_eq(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length15_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl (%edx), %esi
-; X86-NEXT:    movl 4(%edx), %eax
-; X86-NEXT:    xorl (%ecx), %esi
-; X86-NEXT:    xorl 4(%ecx), %eax
-; X86-NEXT:    orl %esi, %eax
-; X86-NEXT:    movl 8(%edx), %esi
-; X86-NEXT:    xorl 8(%ecx), %esi
-; X86-NEXT:    movl 11(%edx), %edx
-; X86-NEXT:    xorl 11(%ecx), %edx
-; X86-NEXT:    orl %esi, %edx
-; X86-NEXT:    orl %eax, %edx
-; X86-NEXT:    sete %al
-; X86-NEXT:    popl %esi
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 15) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-; PR33329 - https://bugs.llvm.org/show_bug.cgi?id=33329
-
-define i32 @length16(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length16:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl (%esi), %ecx
-; X86-NEXT:    movl (%eax), %edx
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    jne .LBB33_4
-; X86-NEXT:  # %bb.1: # %loadbb1
-; X86-NEXT:    movl 4(%esi), %ecx
-; X86-NEXT:    movl 4(%eax), %edx
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    jne .LBB33_4
-; X86-NEXT:  # %bb.2: # %loadbb2
-; X86-NEXT:    movl 8(%esi), %ecx
-; X86-NEXT:    movl 8(%eax), %edx
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    jne .LBB33_4
-; X86-NEXT:  # %bb.3: # %loadbb3
-; X86-NEXT:    movl 12(%esi), %ecx
-; X86-NEXT:    movl 12(%eax), %edx
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    je .LBB33_5
-; X86-NEXT:  .LBB33_4: # %res_block
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    orl $1, %eax
-; X86-NEXT:  .LBB33_5: # %endblock
-; X86-NEXT:    popl %esi
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 16) nounwind
-  ret i32 %m
-}
-
-define i1 @length16_eq(ptr %x, ptr %y) nounwind {
-; X86-NOSSE-LABEL: length16_eq:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl %esi
-; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NOSSE-NEXT:    movl (%edx), %esi
-; X86-NOSSE-NEXT:    movl 4(%edx), %eax
-; X86-NOSSE-NEXT:    xorl (%ecx), %esi
-; X86-NOSSE-NEXT:    xorl 4(%ecx), %eax
-; X86-NOSSE-NEXT:    orl %esi, %eax
-; X86-NOSSE-NEXT:    movl 8(%edx), %esi
-; X86-NOSSE-NEXT:    xorl 8(%ecx), %esi
-; X86-NOSSE-NEXT:    movl 12(%edx), %edx
-; X86-NOSSE-NEXT:    xorl 12(%ecx), %edx
-; X86-NOSSE-NEXT:    orl %esi, %edx
-; X86-NOSSE-NEXT:    orl %eax, %edx
-; X86-NOSSE-NEXT:    setne %al
-; X86-NOSSE-NEXT:    popl %esi
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE1-LABEL: length16_eq:
-; X86-SSE1:       # %bb.0:
-; X86-SSE1-NEXT:    pushl %esi
-; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SSE1-NEXT:    movl (%edx), %esi
-; X86-SSE1-NEXT:    movl 4(%edx), %eax
-; X86-SSE1-NEXT:    xorl (%ecx), %esi
-; X86-SSE1-NEXT:    xorl 4(%ecx), %eax
-; X86-SSE1-NEXT:    orl %esi, %eax
-; X86-SSE1-NEXT:    movl 8(%edx), %esi
-; X86-SSE1-NEXT:    xorl 8(%ecx), %esi
-; X86-SSE1-NEXT:    movl 12(%edx), %edx
-; X86-SSE1-NEXT:    xorl 12(%ecx), %edx
-; X86-SSE1-NEXT:    orl %esi, %edx
-; X86-SSE1-NEXT:    orl %eax, %edx
-; X86-SSE1-NEXT:    setne %al
-; X86-SSE1-NEXT:    popl %esi
-; X86-SSE1-NEXT:    retl
-;
-; X86-SSE2-LABEL: length16_eq:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm1
-; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm1
-; X86-SSE2-NEXT:    pmovmskb %xmm1, %eax
-; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X86-SSE2-NEXT:    setne %al
-; X86-SSE2-NEXT:    retl
-;
-; X86-SSE41-LABEL: length16_eq:
-; X86-SSE41:       # %bb.0:
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE41-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE41-NEXT:    movdqu (%eax), %xmm1
-; X86-SSE41-NEXT:    pxor %xmm0, %xmm1
-; X86-SSE41-NEXT:    ptest %xmm1, %xmm1
-; X86-SSE41-NEXT:    setne %al
-; X86-SSE41-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 16) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length16_lt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length16_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl (%esi), %ecx
-; X86-NEXT:    movl (%eax), %edx
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    jne .LBB35_4
-; X86-NEXT:  # %bb.1: # %loadbb1
-; X86-NEXT:    movl 4(%esi), %ecx
-; X86-NEXT:    movl 4(%eax), %edx
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    jne .LBB35_4
-; X86-NEXT:  # %bb.2: # %loadbb2
-; X86-NEXT:    movl 8(%esi), %ecx
-; X86-NEXT:    movl 8(%eax), %edx
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    jne .LBB35_4
-; X86-NEXT:  # %bb.3: # %loadbb3
-; X86-NEXT:    movl 12(%esi), %ecx
-; X86-NEXT:    movl 12(%eax), %edx
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    je .LBB35_5
-; X86-NEXT:  .LBB35_4: # %res_block
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    orl $1, %eax
-; X86-NEXT:  .LBB35_5: # %endblock
-; X86-NEXT:    shrl $31, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    popl %esi
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 16) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length16_gt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length16_gt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl (%esi), %eax
-; X86-NEXT:    movl (%edx), %ecx
-; X86-NEXT:    bswapl %eax
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    cmpl %ecx, %eax
-; X86-NEXT:    jne .LBB36_4
-; X86-NEXT:  # %bb.1: # %loadbb1
-; X86-NEXT:    movl 4(%esi), %eax
-; X86-NEXT:    movl 4(%edx), %ecx
-; X86-NEXT:    bswapl %eax
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    cmpl %ecx, %eax
-; X86-NEXT:    jne .LBB36_4
-; X86-NEXT:  # %bb.2: # %loadbb2
-; X86-NEXT:    movl 8(%esi), %eax
-; X86-NEXT:    movl 8(%edx), %ecx
-; X86-NEXT:    bswapl %eax
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    cmpl %ecx, %eax
-; X86-NEXT:    jne .LBB36_4
-; X86-NEXT:  # %bb.3: # %loadbb3
-; X86-NEXT:    movl 12(%esi), %eax
-; X86-NEXT:    movl 12(%edx), %ecx
-; X86-NEXT:    bswapl %eax
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    cmpl %ecx, %eax
-; X86-NEXT:    je .LBB36_5
-; X86-NEXT:  .LBB36_4: # %res_block
-; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    cmpl %ecx, %eax
-; X86-NEXT:    sbbl %edx, %edx
-; X86-NEXT:    orl $1, %edx
-; X86-NEXT:  .LBB36_5: # %endblock
-; X86-NEXT:    testl %edx, %edx
-; X86-NEXT:    setg %al
-; X86-NEXT:    popl %esi
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 16) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length16_eq_const(ptr %X) nounwind {
-; X86-NOSSE-LABEL: length16_eq_const:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl %esi
-; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOSSE-NEXT:    movl $858927408, %ecx # imm = 0x33323130
-; X86-NOSSE-NEXT:    xorl (%eax), %ecx
-; X86-NOSSE-NEXT:    movl $926299444, %edx # imm = 0x37363534
-; X86-NOSSE-NEXT:    xorl 4(%eax), %edx
-; X86-NOSSE-NEXT:    orl %ecx, %edx
-; X86-NOSSE-NEXT:    movl $825243960, %ecx # imm = 0x31303938
-; X86-NOSSE-NEXT:    xorl 8(%eax), %ecx
-; X86-NOSSE-NEXT:    movl $892613426, %esi # imm = 0x35343332
-; X86-NOSSE-NEXT:    xorl 12(%eax), %esi
-; X86-NOSSE-NEXT:    orl %ecx, %esi
-; X86-NOSSE-NEXT:    orl %edx, %esi
-; X86-NOSSE-NEXT:    sete %al
-; X86-NOSSE-NEXT:    popl %esi
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE1-LABEL: length16_eq_const:
-; X86-SSE1:       # %bb.0:
-; X86-SSE1-NEXT:    pushl %esi
-; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE1-NEXT:    movl $858927408, %ecx # imm = 0x33323130
-; X86-SSE1-NEXT:    xorl (%eax), %ecx
-; X86-SSE1-NEXT:    movl $926299444, %edx # imm = 0x37363534
-; X86-SSE1-NEXT:    xorl 4(%eax), %edx
-; X86-SSE1-NEXT:    orl %ecx, %edx
-; X86-SSE1-NEXT:    movl $825243960, %ecx # imm = 0x31303938
-; X86-SSE1-NEXT:    xorl 8(%eax), %ecx
-; X86-SSE1-NEXT:    movl $892613426, %esi # imm = 0x35343332
-; X86-SSE1-NEXT:    xorl 12(%eax), %esi
-; X86-SSE1-NEXT:    orl %ecx, %esi
-; X86-SSE1-NEXT:    orl %edx, %esi
-; X86-SSE1-NEXT:    sete %al
-; X86-SSE1-NEXT:    popl %esi
-; X86-SSE1-NEXT:    retl
-;
-; X86-SSE2-LABEL: length16_eq_const:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm0
-; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X86-SSE2-NEXT:    sete %al
-; X86-SSE2-NEXT:    retl
-;
-; X86-SSE41-LABEL: length16_eq_const:
-; X86-SSE41:       # %bb.0:
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT:    movdqu (%eax), %xmm0
-; X86-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE41-NEXT:    ptest %xmm0, %xmm0
-; X86-SSE41-NEXT:    sete %al
-; X86-SSE41-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 16) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-; PR33914 - https://bugs.llvm.org/show_bug.cgi?id=33914
-
-define i32 @length24(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length24:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $24
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 24) nounwind
-  ret i32 %m
-}
-
-define i1 @length24_eq(ptr %x, ptr %y) nounwind {
-; X86-NOSSE-LABEL: length24_eq:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl $24
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    calll memcmp
-; X86-NOSSE-NEXT:    addl $12, %esp
-; X86-NOSSE-NEXT:    testl %eax, %eax
-; X86-NOSSE-NEXT:    sete %al
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE1-LABEL: length24_eq:
-; X86-SSE1:       # %bb.0:
-; X86-SSE1-NEXT:    pushl $24
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    calll memcmp
-; X86-SSE1-NEXT:    addl $12, %esp
-; X86-SSE1-NEXT:    testl %eax, %eax
-; X86-SSE1-NEXT:    sete %al
-; X86-SSE1-NEXT:    retl
-;
-; X86-SSE2-LABEL: length24_eq:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE2-NEXT:    movdqu 8(%ecx), %xmm1
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm2
-; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
-; X86-SSE2-NEXT:    movdqu 8(%eax), %xmm0
-; X86-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
-; X86-SSE2-NEXT:    pand %xmm2, %xmm0
-; X86-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X86-SSE2-NEXT:    sete %al
-; X86-SSE2-NEXT:    retl
-;
-; X86-SSE41-LABEL: length24_eq:
-; X86-SSE41:       # %bb.0:
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE41-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE41-NEXT:    movdqu 8(%ecx), %xmm1
-; X86-SSE41-NEXT:    movdqu (%eax), %xmm2
-; X86-SSE41-NEXT:    pxor %xmm0, %xmm2
-; X86-SSE41-NEXT:    movdqu 8(%eax), %xmm0
-; X86-SSE41-NEXT:    pxor %xmm1, %xmm0
-; X86-SSE41-NEXT:    por %xmm2, %xmm0
-; X86-SSE41-NEXT:    ptest %xmm0, %xmm0
-; X86-SSE41-NEXT:    sete %al
-; X86-SSE41-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 24) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length24_lt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length24_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $24
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    shrl $31, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 24) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length24_gt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length24_gt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $24
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setg %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 24) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length24_eq_const(ptr %X) nounwind {
-; X86-NOSSE-LABEL: length24_eq_const:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl $24
-; X86-NOSSE-NEXT:    pushl $.L.str
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    calll memcmp
-; X86-NOSSE-NEXT:    addl $12, %esp
-; X86-NOSSE-NEXT:    testl %eax, %eax
-; X86-NOSSE-NEXT:    setne %al
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE1-LABEL: length24_eq_const:
-; X86-SSE1:       # %bb.0:
-; X86-SSE1-NEXT:    pushl $24
-; X86-SSE1-NEXT:    pushl $.L.str
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    calll memcmp
-; X86-SSE1-NEXT:    addl $12, %esp
-; X86-SSE1-NEXT:    testl %eax, %eax
-; X86-SSE1-NEXT:    setne %al
-; X86-SSE1-NEXT:    retl
-;
-; X86-SSE2-LABEL: length24_eq_const:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm0
-; X86-SSE2-NEXT:    movdqu 8(%eax), %xmm1
-; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE2-NEXT:    pand %xmm1, %xmm0
-; X86-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X86-SSE2-NEXT:    setne %al
-; X86-SSE2-NEXT:    retl
-;
-; X86-SSE41-LABEL: length24_eq_const:
-; X86-SSE41:       # %bb.0:
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT:    movdqu (%eax), %xmm0
-; X86-SSE41-NEXT:    movdqu 8(%eax), %xmm1
-; X86-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE41-NEXT:    por %xmm1, %xmm0
-; X86-SSE41-NEXT:    ptest %xmm0, %xmm0
-; X86-SSE41-NEXT:    setne %al
-; X86-SSE41-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 24) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length31(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length31:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $31
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 31) nounwind
-  ret i32 %m
-}
-
-define i1 @length31_eq(ptr %x, ptr %y) nounwind {
-; X86-NOSSE-LABEL: length31_eq:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl $31
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    calll memcmp
-; X86-NOSSE-NEXT:    addl $12, %esp
-; X86-NOSSE-NEXT:    testl %eax, %eax
-; X86-NOSSE-NEXT:    sete %al
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE1-LABEL: length31_eq:
-; X86-SSE1:       # %bb.0:
-; X86-SSE1-NEXT:    pushl $31
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    calll memcmp
-; X86-SSE1-NEXT:    addl $12, %esp
-; X86-SSE1-NEXT:    testl %eax, %eax
-; X86-SSE1-NEXT:    sete %al
-; X86-SSE1-NEXT:    retl
-;
-; X86-SSE2-LABEL: length31_eq:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE2-NEXT:    movdqu 15(%ecx), %xmm1
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm2
-; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
-; X86-SSE2-NEXT:    movdqu 15(%eax), %xmm0
-; X86-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
-; X86-SSE2-NEXT:    pand %xmm2, %xmm0
-; X86-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X86-SSE2-NEXT:    sete %al
-; X86-SSE2-NEXT:    retl
-;
-; X86-SSE41-LABEL: length31_eq:
-; X86-SSE41:       # %bb.0:
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE41-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE41-NEXT:    movdqu 15(%ecx), %xmm1
-; X86-SSE41-NEXT:    movdqu (%eax), %xmm2
-; X86-SSE41-NEXT:    pxor %xmm0, %xmm2
-; X86-SSE41-NEXT:    movdqu 15(%eax), %xmm0
-; X86-SSE41-NEXT:    pxor %xmm1, %xmm0
-; X86-SSE41-NEXT:    por %xmm2, %xmm0
-; X86-SSE41-NEXT:    ptest %xmm0, %xmm0
-; X86-SSE41-NEXT:    sete %al
-; X86-SSE41-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 31) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length31_lt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length31_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $31
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    shrl $31, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 31) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length31_gt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length31_gt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $31
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setg %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 31) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length31_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"="128" {
-; X86-NOSSE-LABEL: length31_eq_prefer128:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl $31
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    calll memcmp
-; X86-NOSSE-NEXT:    addl $12, %esp
-; X86-NOSSE-NEXT:    testl %eax, %eax
-; X86-NOSSE-NEXT:    sete %al
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE1-LABEL: length31_eq_prefer128:
-; X86-SSE1:       # %bb.0:
-; X86-SSE1-NEXT:    pushl $31
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    calll memcmp
-; X86-SSE1-NEXT:    addl $12, %esp
-; X86-SSE1-NEXT:    testl %eax, %eax
-; X86-SSE1-NEXT:    sete %al
-; X86-SSE1-NEXT:    retl
-;
-; X86-SSE2-LABEL: length31_eq_prefer128:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE2-NEXT:    movdqu 15(%ecx), %xmm1
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm2
-; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
-; X86-SSE2-NEXT:    movdqu 15(%eax), %xmm0
-; X86-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
-; X86-SSE2-NEXT:    pand %xmm2, %xmm0
-; X86-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X86-SSE2-NEXT:    sete %al
-; X86-SSE2-NEXT:    retl
-;
-; X86-SSE41-LABEL: length31_eq_prefer128:
-; X86-SSE41:       # %bb.0:
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE41-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE41-NEXT:    movdqu 15(%ecx), %xmm1
-; X86-SSE41-NEXT:    movdqu (%eax), %xmm2
-; X86-SSE41-NEXT:    pxor %xmm0, %xmm2
-; X86-SSE41-NEXT:    movdqu 15(%eax), %xmm0
-; X86-SSE41-NEXT:    pxor %xmm1, %xmm0
-; X86-SSE41-NEXT:    por %xmm2, %xmm0
-; X86-SSE41-NEXT:    ptest %xmm0, %xmm0
-; X86-SSE41-NEXT:    sete %al
-; X86-SSE41-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 31) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length31_eq_const(ptr %X) nounwind {
-; X86-NOSSE-LABEL: length31_eq_const:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl $31
-; X86-NOSSE-NEXT:    pushl $.L.str
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    calll memcmp
-; X86-NOSSE-NEXT:    addl $12, %esp
-; X86-NOSSE-NEXT:    testl %eax, %eax
-; X86-NOSSE-NEXT:    setne %al
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE1-LABEL: length31_eq_const:
-; X86-SSE1:       # %bb.0:
-; X86-SSE1-NEXT:    pushl $31
-; X86-SSE1-NEXT:    pushl $.L.str
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    calll memcmp
-; X86-SSE1-NEXT:    addl $12, %esp
-; X86-SSE1-NEXT:    testl %eax, %eax
-; X86-SSE1-NEXT:    setne %al
-; X86-SSE1-NEXT:    retl
-;
-; X86-SSE2-LABEL: length31_eq_const:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm0
-; X86-SSE2-NEXT:    movdqu 15(%eax), %xmm1
-; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE2-NEXT:    pand %xmm1, %xmm0
-; X86-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X86-SSE2-NEXT:    setne %al
-; X86-SSE2-NEXT:    retl
-;
-; X86-SSE41-LABEL: length31_eq_const:
-; X86-SSE41:       # %bb.0:
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT:    movdqu (%eax), %xmm0
-; X86-SSE41-NEXT:    movdqu 15(%eax), %xmm1
-; X86-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE41-NEXT:    por %xmm1, %xmm0
-; X86-SSE41-NEXT:    ptest %xmm0, %xmm0
-; X86-SSE41-NEXT:    setne %al
-; X86-SSE41-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 31) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length32(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length32:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $32
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 32) nounwind
-  ret i32 %m
-}
-
-; PR33325 - https://bugs.llvm.org/show_bug.cgi?id=33325
-
-define i1 @length32_eq(ptr %x, ptr %y) nounwind {
-; X86-NOSSE-LABEL: length32_eq:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl $32
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    calll memcmp
-; X86-NOSSE-NEXT:    addl $12, %esp
-; X86-NOSSE-NEXT:    testl %eax, %eax
-; X86-NOSSE-NEXT:    sete %al
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE1-LABEL: length32_eq:
-; X86-SSE1:       # %bb.0:
-; X86-SSE1-NEXT:    pushl $32
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    calll memcmp
-; X86-SSE1-NEXT:    addl $12, %esp
-; X86-SSE1-NEXT:    testl %eax, %eax
-; X86-SSE1-NEXT:    sete %al
-; X86-SSE1-NEXT:    retl
-;
-; X86-SSE2-LABEL: length32_eq:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE2-NEXT:    movdqu 16(%ecx), %xmm1
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm2
-; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
-; X86-SSE2-NEXT:    movdqu 16(%eax), %xmm0
-; X86-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
-; X86-SSE2-NEXT:    pand %xmm2, %xmm0
-; X86-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X86-SSE2-NEXT:    sete %al
-; X86-SSE2-NEXT:    retl
-;
-; X86-SSE41-LABEL: length32_eq:
-; X86-SSE41:       # %bb.0:
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE41-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE41-NEXT:    movdqu 16(%ecx), %xmm1
-; X86-SSE41-NEXT:    movdqu (%eax), %xmm2
-; X86-SSE41-NEXT:    pxor %xmm0, %xmm2
-; X86-SSE41-NEXT:    movdqu 16(%eax), %xmm0
-; X86-SSE41-NEXT:    pxor %xmm1, %xmm0
-; X86-SSE41-NEXT:    por %xmm2, %xmm0
-; X86-SSE41-NEXT:    ptest %xmm0, %xmm0
-; X86-SSE41-NEXT:    sete %al
-; X86-SSE41-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 32) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length32_lt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length32_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $32
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    shrl $31, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 32) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length32_gt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length32_gt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $32
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setg %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 32) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length32_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"="128" {
-; X86-NOSSE-LABEL: length32_eq_prefer128:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl $32
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    calll memcmp
-; X86-NOSSE-NEXT:    addl $12, %esp
-; X86-NOSSE-NEXT:    testl %eax, %eax
-; X86-NOSSE-NEXT:    sete %al
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE1-LABEL: length32_eq_prefer128:
-; X86-SSE1:       # %bb.0:
-; X86-SSE1-NEXT:    pushl $32
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    calll memcmp
-; X86-SSE1-NEXT:    addl $12, %esp
-; X86-SSE1-NEXT:    testl %eax, %eax
-; X86-SSE1-NEXT:    sete %al
-; X86-SSE1-NEXT:    retl
-;
-; X86-SSE2-LABEL: length32_eq_prefer128:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE2-NEXT:    movdqu 16(%ecx), %xmm1
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm2
-; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
-; X86-SSE2-NEXT:    movdqu 16(%eax), %xmm0
-; X86-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
-; X86-SSE2-NEXT:    pand %xmm2, %xmm0
-; X86-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X86-SSE2-NEXT:    sete %al
-; X86-SSE2-NEXT:    retl
-;
-; X86-SSE41-LABEL: length32_eq_prefer128:
-; X86-SSE41:       # %bb.0:
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE41-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE41-NEXT:    movdqu 16(%ecx), %xmm1
-; X86-SSE41-NEXT:    movdqu (%eax), %xmm2
-; X86-SSE41-NEXT:    pxor %xmm0, %xmm2
-; X86-SSE41-NEXT:    movdqu 16(%eax), %xmm0
-; X86-SSE41-NEXT:    pxor %xmm1, %xmm0
-; X86-SSE41-NEXT:    por %xmm2, %xmm0
-; X86-SSE41-NEXT:    ptest %xmm0, %xmm0
-; X86-SSE41-NEXT:    sete %al
-; X86-SSE41-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 32) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length32_eq_const(ptr %X) nounwind {
-; X86-NOSSE-LABEL: length32_eq_const:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl $32
-; X86-NOSSE-NEXT:    pushl $.L.str
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    calll memcmp
-; X86-NOSSE-NEXT:    addl $12, %esp
-; X86-NOSSE-NEXT:    testl %eax, %eax
-; X86-NOSSE-NEXT:    setne %al
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE1-LABEL: length32_eq_const:
-; X86-SSE1:       # %bb.0:
-; X86-SSE1-NEXT:    pushl $32
-; X86-SSE1-NEXT:    pushl $.L.str
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    calll memcmp
-; X86-SSE1-NEXT:    addl $12, %esp
-; X86-SSE1-NEXT:    testl %eax, %eax
-; X86-SSE1-NEXT:    setne %al
-; X86-SSE1-NEXT:    retl
-;
-; X86-SSE2-LABEL: length32_eq_const:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm0
-; X86-SSE2-NEXT:    movdqu 16(%eax), %xmm1
-; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE2-NEXT:    pand %xmm1, %xmm0
-; X86-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X86-SSE2-NEXT:    setne %al
-; X86-SSE2-NEXT:    retl
-;
-; X86-SSE41-LABEL: length32_eq_const:
-; X86-SSE41:       # %bb.0:
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT:    movdqu (%eax), %xmm0
-; X86-SSE41-NEXT:    movdqu 16(%eax), %xmm1
-; X86-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE41-NEXT:    por %xmm1, %xmm0
-; X86-SSE41-NEXT:    ptest %xmm0, %xmm0
-; X86-SSE41-NEXT:    setne %al
-; X86-SSE41-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 32) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length48(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length48:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $48
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 48) nounwind
-  ret i32 %m
-}
-
-define i1 @length48_eq(ptr %x, ptr %y) nounwind {
-; X86-NOSSE-LABEL: length48_eq:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl $48
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    calll memcmp
-; X86-NOSSE-NEXT:    addl $12, %esp
-; X86-NOSSE-NEXT:    testl %eax, %eax
-; X86-NOSSE-NEXT:    sete %al
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE1-LABEL: length48_eq:
-; X86-SSE1:       # %bb.0:
-; X86-SSE1-NEXT:    pushl $48
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    calll memcmp
-; X86-SSE1-NEXT:    addl $12, %esp
-; X86-SSE1-NEXT:    testl %eax, %eax
-; X86-SSE1-NEXT:    sete %al
-; X86-SSE1-NEXT:    retl
-;
-; X86-SSE2-LABEL: length48_eq:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE2-NEXT:    movdqu 16(%ecx), %xmm1
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm2
-; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
-; X86-SSE2-NEXT:    movdqu 16(%eax), %xmm0
-; X86-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
-; X86-SSE2-NEXT:    pand %xmm2, %xmm0
-; X86-SSE2-NEXT:    movdqu 32(%ecx), %xmm1
-; X86-SSE2-NEXT:    movdqu 32(%eax), %xmm2
-; X86-SSE2-NEXT:    pcmpeqb %xmm1, %xmm2
-; X86-SSE2-NEXT:    pand %xmm0, %xmm2
-; X86-SSE2-NEXT:    pmovmskb %xmm2, %eax
-; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X86-SSE2-NEXT:    sete %al
-; X86-SSE2-NEXT:    retl
-;
-; X86-SSE41-LABEL: length48_eq:
-; X86-SSE41:       # %bb.0:
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE41-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE41-NEXT:    movdqu 16(%ecx), %xmm1
-; X86-SSE41-NEXT:    movdqu (%eax), %xmm2
-; X86-SSE41-NEXT:    pxor %xmm0, %xmm2
-; X86-SSE41-NEXT:    movdqu 16(%eax), %xmm0
-; X86-SSE41-NEXT:    pxor %xmm1, %xmm0
-; X86-SSE41-NEXT:    por %xmm2, %xmm0
-; X86-SSE41-NEXT:    movdqu 32(%ecx), %xmm1
-; X86-SSE41-NEXT:    movdqu 32(%eax), %xmm2
-; X86-SSE41-NEXT:    pxor %xmm1, %xmm2
-; X86-SSE41-NEXT:    por %xmm0, %xmm2
-; X86-SSE41-NEXT:    ptest %xmm2, %xmm2
-; X86-SSE41-NEXT:    sete %al
-; X86-SSE41-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 48) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length48_lt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length48_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $48
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    shrl $31, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 48) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length48_gt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length48_gt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $48
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setg %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 48) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length48_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"="128" {
-; X86-NOSSE-LABEL: length48_eq_prefer128:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl $48
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    calll memcmp
-; X86-NOSSE-NEXT:    addl $12, %esp
-; X86-NOSSE-NEXT:    testl %eax, %eax
-; X86-NOSSE-NEXT:    sete %al
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE1-LABEL: length48_eq_prefer128:
-; X86-SSE1:       # %bb.0:
-; X86-SSE1-NEXT:    pushl $48
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    calll memcmp
-; X86-SSE1-NEXT:    addl $12, %esp
-; X86-SSE1-NEXT:    testl %eax, %eax
-; X86-SSE1-NEXT:    sete %al
-; X86-SSE1-NEXT:    retl
-;
-; X86-SSE2-LABEL: length48_eq_prefer128:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE2-NEXT:    movdqu 16(%ecx), %xmm1
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm2
-; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
-; X86-SSE2-NEXT:    movdqu 16(%eax), %xmm0
-; X86-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
-; X86-SSE2-NEXT:    pand %xmm2, %xmm0
-; X86-SSE2-NEXT:    movdqu 32(%ecx), %xmm1
-; X86-SSE2-NEXT:    movdqu 32(%eax), %xmm2
-; X86-SSE2-NEXT:    pcmpeqb %xmm1, %xmm2
-; X86-SSE2-NEXT:    pand %xmm0, %xmm2
-; X86-SSE2-NEXT:    pmovmskb %xmm2, %eax
-; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X86-SSE2-NEXT:    sete %al
-; X86-SSE2-NEXT:    retl
-;
-; X86-SSE41-LABEL: length48_eq_prefer128:
-; X86-SSE41:       # %bb.0:
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE41-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE41-NEXT:    movdqu 16(%ecx), %xmm1
-; X86-SSE41-NEXT:    movdqu (%eax), %xmm2
-; X86-SSE41-NEXT:    pxor %xmm0, %xmm2
-; X86-SSE41-NEXT:    movdqu 16(%eax), %xmm0
-; X86-SSE41-NEXT:    pxor %xmm1, %xmm0
-; X86-SSE41-NEXT:    por %xmm2, %xmm0
-; X86-SSE41-NEXT:    movdqu 32(%ecx), %xmm1
-; X86-SSE41-NEXT:    movdqu 32(%eax), %xmm2
-; X86-SSE41-NEXT:    pxor %xmm1, %xmm2
-; X86-SSE41-NEXT:    por %xmm0, %xmm2
-; X86-SSE41-NEXT:    ptest %xmm2, %xmm2
-; X86-SSE41-NEXT:    sete %al
-; X86-SSE41-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 48) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length48_eq_const(ptr %X) nounwind {
-; X86-NOSSE-LABEL: length48_eq_const:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl $48
-; X86-NOSSE-NEXT:    pushl $.L.str
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    calll memcmp
-; X86-NOSSE-NEXT:    addl $12, %esp
-; X86-NOSSE-NEXT:    testl %eax, %eax
-; X86-NOSSE-NEXT:    setne %al
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE1-LABEL: length48_eq_const:
-; X86-SSE1:       # %bb.0:
-; X86-SSE1-NEXT:    pushl $48
-; X86-SSE1-NEXT:    pushl $.L.str
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    calll memcmp
-; X86-SSE1-NEXT:    addl $12, %esp
-; X86-SSE1-NEXT:    testl %eax, %eax
-; X86-SSE1-NEXT:    setne %al
-; X86-SSE1-NEXT:    retl
-;
-; X86-SSE2-LABEL: length48_eq_const:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm0
-; X86-SSE2-NEXT:    movdqu 16(%eax), %xmm1
-; X86-SSE2-NEXT:    movdqu 32(%eax), %xmm2
-; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE2-NEXT:    pand %xmm1, %xmm0
-; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
-; X86-SSE2-NEXT:    pand %xmm0, %xmm2
-; X86-SSE2-NEXT:    pmovmskb %xmm2, %eax
-; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X86-SSE2-NEXT:    setne %al
-; X86-SSE2-NEXT:    retl
-;
-; X86-SSE41-LABEL: length48_eq_const:
-; X86-SSE41:       # %bb.0:
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT:    movdqu (%eax), %xmm0
-; X86-SSE41-NEXT:    movdqu 16(%eax), %xmm1
-; X86-SSE41-NEXT:    movdqu 32(%eax), %xmm2
-; X86-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE41-NEXT:    por %xmm1, %xmm0
-; X86-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
-; X86-SSE41-NEXT:    por %xmm0, %xmm2
-; X86-SSE41-NEXT:    ptest %xmm2, %xmm2
-; X86-SSE41-NEXT:    setne %al
-; X86-SSE41-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 48) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length63(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length63:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $63
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 63) nounwind
-  ret i32 %m
-}
-
-define i1 @length63_eq(ptr %x, ptr %y) nounwind {
-; X86-NOSSE-LABEL: length63_eq:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl $63
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    calll memcmp
-; X86-NOSSE-NEXT:    addl $12, %esp
-; X86-NOSSE-NEXT:    testl %eax, %eax
-; X86-NOSSE-NEXT:    setne %al
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE1-LABEL: length63_eq:
-; X86-SSE1:       # %bb.0:
-; X86-SSE1-NEXT:    pushl $63
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    calll memcmp
-; X86-SSE1-NEXT:    addl $12, %esp
-; X86-SSE1-NEXT:    testl %eax, %eax
-; X86-SSE1-NEXT:    setne %al
-; X86-SSE1-NEXT:    retl
-;
-; X86-SSE2-LABEL: length63_eq:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE2-NEXT:    movdqu 16(%ecx), %xmm1
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm2
-; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
-; X86-SSE2-NEXT:    movdqu 16(%eax), %xmm0
-; X86-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
-; X86-SSE2-NEXT:    pand %xmm2, %xmm0
-; X86-SSE2-NEXT:    movdqu 32(%ecx), %xmm1
-; X86-SSE2-NEXT:    movdqu 32(%eax), %xmm2
-; X86-SSE2-NEXT:    pcmpeqb %xmm1, %xmm2
-; X86-SSE2-NEXT:    movdqu 47(%ecx), %xmm1
-; X86-SSE2-NEXT:    movdqu 47(%eax), %xmm3
-; X86-SSE2-NEXT:    pcmpeqb %xmm1, %xmm3
-; X86-SSE2-NEXT:    pand %xmm2, %xmm3
-; X86-SSE2-NEXT:    pand %xmm0, %xmm3
-; X86-SSE2-NEXT:    pmovmskb %xmm3, %eax
-; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X86-SSE2-NEXT:    setne %al
-; X86-SSE2-NEXT:    retl
-;
-; X86-SSE41-LABEL: length63_eq:
-; X86-SSE41:       # %bb.0:
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE41-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE41-NEXT:    movdqu 16(%ecx), %xmm1
-; X86-SSE41-NEXT:    movdqu (%eax), %xmm2
-; X86-SSE41-NEXT:    pxor %xmm0, %xmm2
-; X86-SSE41-NEXT:    movdqu 16(%eax), %xmm0
-; X86-SSE41-NEXT:    pxor %xmm1, %xmm0
-; X86-SSE41-NEXT:    por %xmm2, %xmm0
-; X86-SSE41-NEXT:    movdqu 32(%ecx), %xmm1
-; X86-SSE41-NEXT:    movdqu 32(%eax), %xmm2
-; X86-SSE41-NEXT:    pxor %xmm1, %xmm2
-; X86-SSE41-NEXT:    movdqu 47(%ecx), %xmm1
-; X86-SSE41-NEXT:    movdqu 47(%eax), %xmm3
-; X86-SSE41-NEXT:    pxor %xmm1, %xmm3
-; X86-SSE41-NEXT:    por %xmm2, %xmm3
-; X86-SSE41-NEXT:    por %xmm0, %xmm3
-; X86-SSE41-NEXT:    ptest %xmm3, %xmm3
-; X86-SSE41-NEXT:    setne %al
-; X86-SSE41-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 63) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length63_lt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length63_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $63
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    shrl $31, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 63) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length63_gt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length63_gt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $63
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setg %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 63) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length63_eq_const(ptr %X) nounwind {
-; X86-NOSSE-LABEL: length63_eq_const:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl $63
-; X86-NOSSE-NEXT:    pushl $.L.str
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    calll memcmp
-; X86-NOSSE-NEXT:    addl $12, %esp
-; X86-NOSSE-NEXT:    testl %eax, %eax
-; X86-NOSSE-NEXT:    sete %al
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE1-LABEL: length63_eq_const:
-; X86-SSE1:       # %bb.0:
-; X86-SSE1-NEXT:    pushl $63
-; X86-SSE1-NEXT:    pushl $.L.str
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    calll memcmp
-; X86-SSE1-NEXT:    addl $12, %esp
-; X86-SSE1-NEXT:    testl %eax, %eax
-; X86-SSE1-NEXT:    sete %al
-; X86-SSE1-NEXT:    retl
-;
-; X86-SSE2-LABEL: length63_eq_const:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm0
-; X86-SSE2-NEXT:    movdqu 16(%eax), %xmm1
-; X86-SSE2-NEXT:    movdqu 32(%eax), %xmm2
-; X86-SSE2-NEXT:    movdqu 47(%eax), %xmm3
-; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3
-; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
-; X86-SSE2-NEXT:    pand %xmm3, %xmm2
-; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE2-NEXT:    pand %xmm1, %xmm0
-; X86-SSE2-NEXT:    pand %xmm2, %xmm0
-; X86-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X86-SSE2-NEXT:    sete %al
-; X86-SSE2-NEXT:    retl
-;
-; X86-SSE41-LABEL: length63_eq_const:
-; X86-SSE41:       # %bb.0:
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT:    movdqu (%eax), %xmm0
-; X86-SSE41-NEXT:    movdqu 16(%eax), %xmm1
-; X86-SSE41-NEXT:    movdqu 32(%eax), %xmm2
-; X86-SSE41-NEXT:    movdqu 47(%eax), %xmm3
-; X86-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3
-; X86-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
-; X86-SSE41-NEXT:    por %xmm3, %xmm2
-; X86-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE41-NEXT:    por %xmm1, %xmm0
-; X86-SSE41-NEXT:    por %xmm2, %xmm0
-; X86-SSE41-NEXT:    ptest %xmm0, %xmm0
-; X86-SSE41-NEXT:    sete %al
-; X86-SSE41-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 63) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length64(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length64:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $64
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 64) nounwind
-  ret i32 %m
-}
-
-define i1 @length64_eq(ptr %x, ptr %y) nounwind {
-; X86-NOSSE-LABEL: length64_eq:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl $64
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    calll memcmp
-; X86-NOSSE-NEXT:    addl $12, %esp
-; X86-NOSSE-NEXT:    testl %eax, %eax
-; X86-NOSSE-NEXT:    setne %al
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE1-LABEL: length64_eq:
-; X86-SSE1:       # %bb.0:
-; X86-SSE1-NEXT:    pushl $64
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    calll memcmp
-; X86-SSE1-NEXT:    addl $12, %esp
-; X86-SSE1-NEXT:    testl %eax, %eax
-; X86-SSE1-NEXT:    setne %al
-; X86-SSE1-NEXT:    retl
-;
-; X86-SSE2-LABEL: length64_eq:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE2-NEXT:    movdqu 16(%ecx), %xmm1
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm2
-; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
-; X86-SSE2-NEXT:    movdqu 16(%eax), %xmm0
-; X86-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
-; X86-SSE2-NEXT:    pand %xmm2, %xmm0
-; X86-SSE2-NEXT:    movdqu 32(%ecx), %xmm1
-; X86-SSE2-NEXT:    movdqu 32(%eax), %xmm2
-; X86-SSE2-NEXT:    pcmpeqb %xmm1, %xmm2
-; X86-SSE2-NEXT:    movdqu 48(%ecx), %xmm1
-; X86-SSE2-NEXT:    movdqu 48(%eax), %xmm3
-; X86-SSE2-NEXT:    pcmpeqb %xmm1, %xmm3
-; X86-SSE2-NEXT:    pand %xmm2, %xmm3
-; X86-SSE2-NEXT:    pand %xmm0, %xmm3
-; X86-SSE2-NEXT:    pmovmskb %xmm3, %eax
-; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X86-SSE2-NEXT:    setne %al
-; X86-SSE2-NEXT:    retl
-;
-; X86-SSE41-LABEL: length64_eq:
-; X86-SSE41:       # %bb.0:
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE41-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE41-NEXT:    movdqu 16(%ecx), %xmm1
-; X86-SSE41-NEXT:    movdqu (%eax), %xmm2
-; X86-SSE41-NEXT:    pxor %xmm0, %xmm2
-; X86-SSE41-NEXT:    movdqu 16(%eax), %xmm0
-; X86-SSE41-NEXT:    pxor %xmm1, %xmm0
-; X86-SSE41-NEXT:    por %xmm2, %xmm0
-; X86-SSE41-NEXT:    movdqu 32(%ecx), %xmm1
-; X86-SSE41-NEXT:    movdqu 32(%eax), %xmm2
-; X86-SSE41-NEXT:    pxor %xmm1, %xmm2
-; X86-SSE41-NEXT:    movdqu 48(%ecx), %xmm1
-; X86-SSE41-NEXT:    movdqu 48(%eax), %xmm3
-; X86-SSE41-NEXT:    pxor %xmm1, %xmm3
-; X86-SSE41-NEXT:    por %xmm2, %xmm3
-; X86-SSE41-NEXT:    por %xmm0, %xmm3
-; X86-SSE41-NEXT:    ptest %xmm3, %xmm3
-; X86-SSE41-NEXT:    setne %al
-; X86-SSE41-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 64) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length64_lt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length64_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $64
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    shrl $31, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 64) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length64_gt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length64_gt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $64
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setg %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 64) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length64_eq_const(ptr %X) nounwind {
-; X86-NOSSE-LABEL: length64_eq_const:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl $64
-; X86-NOSSE-NEXT:    pushl $.L.str
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    calll memcmp
-; X86-NOSSE-NEXT:    addl $12, %esp
-; X86-NOSSE-NEXT:    testl %eax, %eax
-; X86-NOSSE-NEXT:    sete %al
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE1-LABEL: length64_eq_const:
-; X86-SSE1:       # %bb.0:
-; X86-SSE1-NEXT:    pushl $64
-; X86-SSE1-NEXT:    pushl $.L.str
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    calll memcmp
-; X86-SSE1-NEXT:    addl $12, %esp
-; X86-SSE1-NEXT:    testl %eax, %eax
-; X86-SSE1-NEXT:    sete %al
-; X86-SSE1-NEXT:    retl
-;
-; X86-SSE2-LABEL: length64_eq_const:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm0
-; X86-SSE2-NEXT:    movdqu 16(%eax), %xmm1
-; X86-SSE2-NEXT:    movdqu 32(%eax), %xmm2
-; X86-SSE2-NEXT:    movdqu 48(%eax), %xmm3
-; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3
-; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
-; X86-SSE2-NEXT:    pand %xmm3, %xmm2
-; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE2-NEXT:    pand %xmm1, %xmm0
-; X86-SSE2-NEXT:    pand %xmm2, %xmm0
-; X86-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X86-SSE2-NEXT:    sete %al
-; X86-SSE2-NEXT:    retl
-;
-; X86-SSE41-LABEL: length64_eq_const:
-; X86-SSE41:       # %bb.0:
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT:    movdqu (%eax), %xmm0
-; X86-SSE41-NEXT:    movdqu 16(%eax), %xmm1
-; X86-SSE41-NEXT:    movdqu 32(%eax), %xmm2
-; X86-SSE41-NEXT:    movdqu 48(%eax), %xmm3
-; X86-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3
-; X86-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
-; X86-SSE41-NEXT:    por %xmm3, %xmm2
-; X86-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE41-NEXT:    por %xmm1, %xmm0
-; X86-SSE41-NEXT:    por %xmm2, %xmm0
-; X86-SSE41-NEXT:    ptest %xmm0, %xmm0
-; X86-SSE41-NEXT:    sete %al
-; X86-SSE41-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 64) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length96(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length96:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $96
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 96) nounwind
-  ret i32 %m
-}
-
-define i1 @length96_eq(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length96_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $96
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 96) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length96_lt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length96_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $96
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    shrl $31, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 96) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length96_gt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length96_gt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $96
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setg %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 96) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length96_eq_const(ptr %X) nounwind {
-; X86-LABEL: length96_eq_const:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $96
-; X86-NEXT:    pushl $.L.str
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 96) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length127(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length127:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $127
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 127) nounwind
-  ret i32 %m
-}
-
-define i1 @length127_eq(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length127_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $127
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 127) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length127_lt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length127_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $127
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    shrl $31, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 127) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length127_gt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length127_gt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $127
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setg %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 127) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length127_eq_const(ptr %X) nounwind {
-; X86-LABEL: length127_eq_const:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $127
-; X86-NEXT:    pushl $.L.str
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 127) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length128(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length128:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $128
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 128) nounwind
-  ret i32 %m
-}
-
-define i1 @length128_eq(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length128_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $128
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 128) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length128_lt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length128_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $128
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    shrl $31, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 128) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length128_gt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length128_gt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $128
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setg %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 128) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length128_eq_const(ptr %X) nounwind {
-; X86-LABEL: length128_eq_const:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $128
-; X86-NEXT:    pushl $.L.str
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 128) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length192(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length192:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $192
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 192) nounwind
-  ret i32 %m
-}
-
-define i1 @length192_eq(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length192_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $192
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 192) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length192_lt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length192_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $192
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    shrl $31, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 192) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length192_gt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length192_gt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $192
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setg %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 192) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length192_eq_const(ptr %X) nounwind {
-; X86-LABEL: length192_eq_const:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $192
-; X86-NEXT:    pushl $.L.str
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 192) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length255(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length255:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $255
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 255) nounwind
-  ret i32 %m
-}
-
-define i1 @length255_eq(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length255_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $255
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 255) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length255_lt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length255_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $255
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    shrl $31, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 255) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length255_gt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length255_gt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $255
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setg %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 255) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length255_eq_const(ptr %X) nounwind {
-; X86-LABEL: length255_eq_const:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $255
-; X86-NEXT:    pushl $.L.str
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 255) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length256(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length256:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $256 # imm = 0x100
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 256) nounwind
-  ret i32 %m
-}
-
-define i1 @length256_eq(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length256_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $256 # imm = 0x100
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 256) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length256_lt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length256_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $256 # imm = 0x100
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    shrl $31, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 256) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length256_gt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length256_gt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $256 # imm = 0x100
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setg %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 256) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length256_eq_const(ptr %X) nounwind {
-; X86-LABEL: length256_eq_const:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $256 # imm = 0x100
-; X86-NEXT:    pushl $.L.str
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 256) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length384(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length384:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $384 # imm = 0x180
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 384) nounwind
-  ret i32 %m
-}
-
-define i1 @length384_eq(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length384_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $384 # imm = 0x180
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 384) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length384_lt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length384_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $384 # imm = 0x180
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    shrl $31, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 384) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length384_gt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length384_gt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $384 # imm = 0x180
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setg %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 384) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length384_eq_const(ptr %X) nounwind {
-; X86-LABEL: length384_eq_const:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $384 # imm = 0x180
-; X86-NEXT:    pushl $.L.str
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 384) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length511(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length511:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $511 # imm = 0x1FF
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 511) nounwind
-  ret i32 %m
-}
-
-define i1 @length511_eq(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length511_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $511 # imm = 0x1FF
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 511) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length511_lt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length511_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $511 # imm = 0x1FF
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    shrl $31, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 511) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length511_gt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length511_gt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $511 # imm = 0x1FF
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setg %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 511) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length511_eq_const(ptr %X) nounwind {
-; X86-LABEL: length511_eq_const:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $511 # imm = 0x1FF
-; X86-NEXT:    pushl $.L.str
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 511) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length512(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length512:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $512 # imm = 0x200
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 512) nounwind
-  ret i32 %m
-}
-
-define i1 @length512_eq(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length512_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $512 # imm = 0x200
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 512) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length512_lt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length512_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $512 # imm = 0x200
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    shrl $31, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 512) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length512_gt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length512_gt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $512 # imm = 0x200
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setg %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 512) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length512_eq_const(ptr %X) nounwind {
-; X86-LABEL: length512_eq_const:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $512 # imm = 0x200
-; X86-NEXT:    pushl $.L.str
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 512) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-; This checks that we do not do stupid things with huge sizes.
-define i32 @huge_length(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: huge_length:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $-1
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 9223372036854775807) nounwind
-  ret i32 %m
-}
-
-define i1 @huge_length_eq(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: huge_length_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $-1
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 9223372036854775807) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-; This checks non-constant sizes.
-define i32 @nonconst_length(ptr %X, ptr %Y, i32 %size) nounwind {
-; X86-LABEL: nonconst_length:
-; X86:       # %bb.0:
-; X86-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 %size) nounwind
-  ret i32 %m
-}
-
-define i1 @nonconst_length_eq(ptr %X, ptr %Y, i32 %size) nounwind {
-; X86-LABEL: nonconst_length_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 %size) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
diff --git a/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll b/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll
deleted file mode 100644
index 6eb02bfc1fd0c3..00000000000000
--- a/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll
+++ /dev/null
@@ -1,4006 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; NOTE: This is a copy of llvm/test/CodeGen/X86/memcmp.ll with more load pairs. Please keep it that way.
-; RUN: llc -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 < %s -mtriple=x86_64-unknown-unknown               | FileCheck %s --check-prefixes=X64,X64-SSE,X64-SSE2
-; RUN: llc -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 < %s -mtriple=x86_64-unknown-unknown -mattr=sse4.1 | FileCheck %s --check-prefixes=X64,X64-SSE,X64-SSE41
-; RUN: llc -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 < %s -mtriple=x86_64-unknown-unknown -mattr=avx    | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX1
-; RUN: llc -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 < %s -mtriple=x86_64-unknown-unknown -mattr=avx2   | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX2
-; RUN: llc -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 < %s -mtriple=x86_64-unknown-unknown -mattr=avx512bw,+prefer-256-bit | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX2
-; RUN: llc -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 < %s -mtriple=x86_64-unknown-unknown -mattr=avx512bw,-prefer-256-bit | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX512,X64-AVX512BW
-; RUN: llc -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 < %s -mtriple=x86_64-unknown-unknown -mattr=avx512f,+prefer-256-bit,-prefer-mask-registers | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX2
-; RUN: llc -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 < %s -mtriple=x86_64-unknown-unknown -mattr=avx512f,-prefer-256-bit,-prefer-mask-registers | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX512,X64-AVX512F
-; RUN: llc -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 < %s -mtriple=x86_64-unknown-unknown -mattr=avx512f,+prefer-256-bit,+prefer-mask-registers | FileCheck %s --check-prefixes=X64,X64-MIC-AVX,X64-MIC-AVX2
-; RUN: llc -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 < %s -mtriple=x86_64-unknown-unknown -mattr=avx512f,-prefer-256-bit,+prefer-mask-registers | FileCheck %s --check-prefixes=X64,X64-MIC-AVX,X64-MIC-AVX512F
-
-; This tests codegen time inlining/optimization of memcmp
-; rdar://6480398
-
- at .str = private constant [513 x i8] c"01234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901\00", align 1
-
-declare dso_local i32 @memcmp(ptr, ptr, i64)
-
-define i32 @length0(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length0:
-; X64:       # %bb.0:
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    retq
-   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 0) nounwind
-   ret i32 %m
- }
-
-define i1 @length0_eq(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length0_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movb $1, %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 0) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length0_lt(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length0_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 0) nounwind
-  %c = icmp slt i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length2(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length2:
-; X64:       # %bb.0:
-; X64-NEXT:    movzwl (%rdi), %eax
-; X64-NEXT:    movzwl (%rsi), %ecx
-; X64-NEXT:    rolw $8, %ax
-; X64-NEXT:    rolw $8, %cx
-; X64-NEXT:    movzwl %ax, %eax
-; X64-NEXT:    movzwl %cx, %ecx
-; X64-NEXT:    subl %ecx, %eax
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
-  ret i32 %m
-}
-
-define i1 @length2_eq(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length2_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movzwl (%rdi), %eax
-; X64-NEXT:    cmpw (%rsi), %ax
-; X64-NEXT:    sete %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length2_lt(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length2_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    movzwl (%rdi), %eax
-; X64-NEXT:    movzwl (%rsi), %ecx
-; X64-NEXT:    rolw $8, %ax
-; X64-NEXT:    rolw $8, %cx
-; X64-NEXT:    movzwl %ax, %eax
-; X64-NEXT:    movzwl %cx, %ecx
-; X64-NEXT:    subl %ecx, %eax
-; X64-NEXT:    shrl $31, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
-  %c = icmp slt i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length2_gt(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length2_gt:
-; X64:       # %bb.0:
-; X64-NEXT:    movzwl (%rdi), %eax
-; X64-NEXT:    movzwl (%rsi), %ecx
-; X64-NEXT:    rolw $8, %ax
-; X64-NEXT:    rolw $8, %cx
-; X64-NEXT:    movzwl %ax, %eax
-; X64-NEXT:    movzwl %cx, %ecx
-; X64-NEXT:    subl %ecx, %eax
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setg %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
-  %c = icmp sgt i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length2_eq_const(ptr %X) nounwind {
-; X64-LABEL: length2_eq_const:
-; X64:       # %bb.0:
-; X64-NEXT:    movzwl (%rdi), %eax
-; X64-NEXT:    cmpl $12849, %eax # imm = 0x3231
-; X64-NEXT:    setne %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i64 2) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length2_eq_nobuiltin_attr(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length2_eq_nobuiltin_attr:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $2, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    sete %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind nobuiltin
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length3(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length3:
-; X64:       # %bb.0:
-; X64-NEXT:    movzwl (%rdi), %ecx
-; X64-NEXT:    movzwl (%rsi), %edx
-; X64-NEXT:    rolw $8, %cx
-; X64-NEXT:    rolw $8, %dx
-; X64-NEXT:    cmpw %dx, %cx
-; X64-NEXT:    jne .LBB9_3
-; X64-NEXT:  # %bb.1: # %loadbb1
-; X64-NEXT:    movzbl 2(%rdi), %eax
-; X64-NEXT:    movzbl 2(%rsi), %ecx
-; X64-NEXT:    subl %ecx, %eax
-; X64-NEXT:    retq
-; X64-NEXT:  .LBB9_3: # %res_block
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpw %dx, %cx
-; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    orl $1, %eax
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 3) nounwind
-  ret i32 %m
-}
-
-define i1 @length3_eq(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length3_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movzwl (%rdi), %eax
-; X64-NEXT:    xorw (%rsi), %ax
-; X64-NEXT:    movzbl 2(%rdi), %ecx
-; X64-NEXT:    xorb 2(%rsi), %cl
-; X64-NEXT:    movzbl %cl, %ecx
-; X64-NEXT:    orw %ax, %cx
-; X64-NEXT:    setne %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 3) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length4(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length4:
-; X64:       # %bb.0:
-; X64-NEXT:    movl (%rdi), %ecx
-; X64-NEXT:    movl (%rsi), %edx
-; X64-NEXT:    bswapl %ecx
-; X64-NEXT:    bswapl %edx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpl %edx, %ecx
-; X64-NEXT:    seta %al
-; X64-NEXT:    sbbl $0, %eax
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
-  ret i32 %m
-}
-
-define i1 @length4_eq(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length4_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    cmpl (%rsi), %eax
-; X64-NEXT:    setne %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length4_lt(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length4_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    movl (%rsi), %ecx
-; X64-NEXT:    bswapl %eax
-; X64-NEXT:    bswapl %ecx
-; X64-NEXT:    cmpl %ecx, %eax
-; X64-NEXT:    setb %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
-  %c = icmp slt i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length4_gt(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length4_gt:
-; X64:       # %bb.0:
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    movl (%rsi), %ecx
-; X64-NEXT:    bswapl %eax
-; X64-NEXT:    bswapl %ecx
-; X64-NEXT:    cmpl %ecx, %eax
-; X64-NEXT:    seta %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
-  %c = icmp sgt i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length4_eq_const(ptr %X) nounwind {
-; X64-LABEL: length4_eq_const:
-; X64:       # %bb.0:
-; X64-NEXT:    cmpl $875770417, (%rdi) # imm = 0x34333231
-; X64-NEXT:    sete %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i64 4) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length5(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length5:
-; X64:       # %bb.0:
-; X64-NEXT:    movl (%rdi), %ecx
-; X64-NEXT:    movl (%rsi), %edx
-; X64-NEXT:    bswapl %ecx
-; X64-NEXT:    bswapl %edx
-; X64-NEXT:    cmpl %edx, %ecx
-; X64-NEXT:    jne .LBB16_3
-; X64-NEXT:  # %bb.1: # %loadbb1
-; X64-NEXT:    movzbl 4(%rdi), %eax
-; X64-NEXT:    movzbl 4(%rsi), %ecx
-; X64-NEXT:    subl %ecx, %eax
-; X64-NEXT:    retq
-; X64-NEXT:  .LBB16_3: # %res_block
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpl %edx, %ecx
-; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    orl $1, %eax
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) nounwind
-  ret i32 %m
-}
-
-define i1 @length5_eq(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length5_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    xorl (%rsi), %eax
-; X64-NEXT:    movzbl 4(%rdi), %ecx
-; X64-NEXT:    xorb 4(%rsi), %cl
-; X64-NEXT:    movzbl %cl, %ecx
-; X64-NEXT:    orl %eax, %ecx
-; X64-NEXT:    setne %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length5_lt(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length5_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    movl (%rdi), %ecx
-; X64-NEXT:    movl (%rsi), %edx
-; X64-NEXT:    bswapl %ecx
-; X64-NEXT:    bswapl %edx
-; X64-NEXT:    cmpl %edx, %ecx
-; X64-NEXT:    jne .LBB18_3
-; X64-NEXT:  # %bb.1: # %loadbb1
-; X64-NEXT:    movzbl 4(%rdi), %eax
-; X64-NEXT:    movzbl 4(%rsi), %ecx
-; X64-NEXT:    subl %ecx, %eax
-; X64-NEXT:    shrl $31, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    retq
-; X64-NEXT:  .LBB18_3: # %res_block
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpl %edx, %ecx
-; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    orl $1, %eax
-; X64-NEXT:    shrl $31, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) nounwind
-  %c = icmp slt i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length7(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length7:
-; X64:       # %bb.0:
-; X64-NEXT:    movl (%rdi), %ecx
-; X64-NEXT:    movl (%rsi), %edx
-; X64-NEXT:    bswapl %ecx
-; X64-NEXT:    bswapl %edx
-; X64-NEXT:    cmpl %edx, %ecx
-; X64-NEXT:    jne .LBB19_2
-; X64-NEXT:  # %bb.1: # %loadbb1
-; X64-NEXT:    movl 3(%rdi), %ecx
-; X64-NEXT:    movl 3(%rsi), %edx
-; X64-NEXT:    bswapl %ecx
-; X64-NEXT:    bswapl %edx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpl %edx, %ecx
-; X64-NEXT:    je .LBB19_3
-; X64-NEXT:  .LBB19_2: # %res_block
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpl %edx, %ecx
-; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    orl $1, %eax
-; X64-NEXT:  .LBB19_3: # %endblock
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 7) nounwind
-  ret i32 %m
-}
-
-define i1 @length7_eq(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length7_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    movl 3(%rdi), %ecx
-; X64-NEXT:    xorl (%rsi), %eax
-; X64-NEXT:    xorl 3(%rsi), %ecx
-; X64-NEXT:    orl %eax, %ecx
-; X64-NEXT:    setne %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 7) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length7_lt(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length7_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    movl (%rdi), %ecx
-; X64-NEXT:    movl (%rsi), %edx
-; X64-NEXT:    bswapl %ecx
-; X64-NEXT:    bswapl %edx
-; X64-NEXT:    cmpl %edx, %ecx
-; X64-NEXT:    jne .LBB21_2
-; X64-NEXT:  # %bb.1: # %loadbb1
-; X64-NEXT:    movl 3(%rdi), %ecx
-; X64-NEXT:    movl 3(%rsi), %edx
-; X64-NEXT:    bswapl %ecx
-; X64-NEXT:    bswapl %edx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpl %edx, %ecx
-; X64-NEXT:    je .LBB21_3
-; X64-NEXT:  .LBB21_2: # %res_block
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpl %edx, %ecx
-; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    orl $1, %eax
-; X64-NEXT:  .LBB21_3: # %endblock
-; X64-NEXT:    shrl $31, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 7) nounwind
-  %c = icmp slt i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length8(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length8:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rcx
-; X64-NEXT:    movq (%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    seta %al
-; X64-NEXT:    sbbl $0, %eax
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 8) nounwind
-  ret i32 %m
-}
-
-define i1 @length8_eq(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length8_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rax
-; X64-NEXT:    cmpq (%rsi), %rax
-; X64-NEXT:    sete %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 8) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length8_eq_const(ptr %X) nounwind {
-; X64-LABEL: length8_eq_const:
-; X64:       # %bb.0:
-; X64-NEXT:    movabsq $3978425819141910832, %rax # imm = 0x3736353433323130
-; X64-NEXT:    cmpq %rax, (%rdi)
-; X64-NEXT:    setne %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 8) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length9_eq(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length9_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rax
-; X64-NEXT:    xorq (%rsi), %rax
-; X64-NEXT:    movzbl 8(%rdi), %ecx
-; X64-NEXT:    xorb 8(%rsi), %cl
-; X64-NEXT:    movzbl %cl, %ecx
-; X64-NEXT:    orq %rax, %rcx
-; X64-NEXT:    sete %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 9) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length10_eq(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length10_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rax
-; X64-NEXT:    xorq (%rsi), %rax
-; X64-NEXT:    movzwl 8(%rdi), %ecx
-; X64-NEXT:    xorw 8(%rsi), %cx
-; X64-NEXT:    movzwl %cx, %ecx
-; X64-NEXT:    orq %rax, %rcx
-; X64-NEXT:    sete %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 10) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length11_eq(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length11_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rax
-; X64-NEXT:    movq 3(%rdi), %rcx
-; X64-NEXT:    xorq (%rsi), %rax
-; X64-NEXT:    xorq 3(%rsi), %rcx
-; X64-NEXT:    orq %rax, %rcx
-; X64-NEXT:    sete %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 11) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length12_eq(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length12_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rax
-; X64-NEXT:    xorq (%rsi), %rax
-; X64-NEXT:    movl 8(%rdi), %ecx
-; X64-NEXT:    xorl 8(%rsi), %ecx
-; X64-NEXT:    orq %rax, %rcx
-; X64-NEXT:    setne %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 12) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length12(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length12:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rcx
-; X64-NEXT:    movq (%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    jne .LBB29_2
-; X64-NEXT:  # %bb.1: # %loadbb1
-; X64-NEXT:    movl 8(%rdi), %ecx
-; X64-NEXT:    movl 8(%rsi), %edx
-; X64-NEXT:    bswapl %ecx
-; X64-NEXT:    bswapl %edx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    je .LBB29_3
-; X64-NEXT:  .LBB29_2: # %res_block
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    orl $1, %eax
-; X64-NEXT:  .LBB29_3: # %endblock
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 12) nounwind
-  ret i32 %m
-}
-
-define i1 @length13_eq(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length13_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rax
-; X64-NEXT:    movq 5(%rdi), %rcx
-; X64-NEXT:    xorq (%rsi), %rax
-; X64-NEXT:    xorq 5(%rsi), %rcx
-; X64-NEXT:    orq %rax, %rcx
-; X64-NEXT:    sete %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 13) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length14_eq(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length14_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rax
-; X64-NEXT:    movq 6(%rdi), %rcx
-; X64-NEXT:    xorq (%rsi), %rax
-; X64-NEXT:    xorq 6(%rsi), %rcx
-; X64-NEXT:    orq %rax, %rcx
-; X64-NEXT:    sete %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 14) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length15_eq(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length15_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rax
-; X64-NEXT:    movq 7(%rdi), %rcx
-; X64-NEXT:    xorq (%rsi), %rax
-; X64-NEXT:    xorq 7(%rsi), %rcx
-; X64-NEXT:    orq %rax, %rcx
-; X64-NEXT:    sete %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 15) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-; PR33329 - https://bugs.llvm.org/show_bug.cgi?id=33329
-
-define i32 @length16(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length16:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rcx
-; X64-NEXT:    movq (%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    jne .LBB33_2
-; X64-NEXT:  # %bb.1: # %loadbb1
-; X64-NEXT:    movq 8(%rdi), %rcx
-; X64-NEXT:    movq 8(%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    je .LBB33_3
-; X64-NEXT:  .LBB33_2: # %res_block
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    orl $1, %eax
-; X64-NEXT:  .LBB33_3: # %endblock
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 16) nounwind
-  ret i32 %m
-}
-
-define i1 @length16_eq(ptr %x, ptr %y) nounwind {
-; X64-SSE2-LABEL: length16_eq:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT:    movdqu (%rsi), %xmm1
-; X64-SSE2-NEXT:    pcmpeqb %xmm0, %xmm1
-; X64-SSE2-NEXT:    pmovmskb %xmm1, %eax
-; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT:    setne %al
-; X64-SSE2-NEXT:    retq
-;
-; X64-SSE41-LABEL: length16_eq:
-; X64-SSE41:       # %bb.0:
-; X64-SSE41-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE41-NEXT:    movdqu (%rsi), %xmm1
-; X64-SSE41-NEXT:    pxor %xmm0, %xmm1
-; X64-SSE41-NEXT:    ptest %xmm1, %xmm1
-; X64-SSE41-NEXT:    setne %al
-; X64-SSE41-NEXT:    retq
-;
-; X64-AVX-LABEL: length16_eq:
-; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-AVX-NEXT:    vpxor (%rsi), %xmm0, %xmm0
-; X64-AVX-NEXT:    vptest %xmm0, %xmm0
-; X64-AVX-NEXT:    setne %al
-; X64-AVX-NEXT:    retq
-;
-; X64-MIC-AVX-LABEL: length16_eq:
-; X64-MIC-AVX:       # %bb.0:
-; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-MIC-AVX-NEXT:    vmovdqu (%rsi), %xmm1
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0
-; X64-MIC-AVX-NEXT:    kortestw %k0, %k0
-; X64-MIC-AVX-NEXT:    setne %al
-; X64-MIC-AVX-NEXT:    vzeroupper
-; X64-MIC-AVX-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 16) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length16_lt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length16_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rcx
-; X64-NEXT:    movq (%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    jne .LBB35_2
-; X64-NEXT:  # %bb.1: # %loadbb1
-; X64-NEXT:    movq 8(%rdi), %rcx
-; X64-NEXT:    movq 8(%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    je .LBB35_3
-; X64-NEXT:  .LBB35_2: # %res_block
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    orl $1, %eax
-; X64-NEXT:  .LBB35_3: # %endblock
-; X64-NEXT:    shrl $31, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 16) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length16_gt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length16_gt:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rax
-; X64-NEXT:    movq (%rsi), %rcx
-; X64-NEXT:    bswapq %rax
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    cmpq %rcx, %rax
-; X64-NEXT:    jne .LBB36_2
-; X64-NEXT:  # %bb.1: # %loadbb1
-; X64-NEXT:    movq 8(%rdi), %rax
-; X64-NEXT:    movq 8(%rsi), %rcx
-; X64-NEXT:    bswapq %rax
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    xorl %edx, %edx
-; X64-NEXT:    cmpq %rcx, %rax
-; X64-NEXT:    je .LBB36_3
-; X64-NEXT:  .LBB36_2: # %res_block
-; X64-NEXT:    xorl %edx, %edx
-; X64-NEXT:    cmpq %rcx, %rax
-; X64-NEXT:    sbbl %edx, %edx
-; X64-NEXT:    orl $1, %edx
-; X64-NEXT:  .LBB36_3: # %endblock
-; X64-NEXT:    testl %edx, %edx
-; X64-NEXT:    setg %al
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 16) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length16_eq_const(ptr %X) nounwind {
-; X64-SSE2-LABEL: length16_eq_const:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT:    sete %al
-; X64-SSE2-NEXT:    retq
-;
-; X64-SSE41-LABEL: length16_eq_const:
-; X64-SSE41:       # %bb.0:
-; X64-SSE41-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE41-NEXT:    ptest %xmm0, %xmm0
-; X64-SSE41-NEXT:    sete %al
-; X64-SSE41-NEXT:    retq
-;
-; X64-AVX-LABEL: length16_eq_const:
-; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-AVX-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT:    vptest %xmm0, %xmm0
-; X64-AVX-NEXT:    sete %al
-; X64-AVX-NEXT:    retq
-;
-; X64-MIC-AVX-LABEL: length16_eq_const:
-; X64-MIC-AVX:       # %bb.0:
-; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-MIC-AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [858927408,926299444,825243960,892613426]
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0
-; X64-MIC-AVX-NEXT:    kortestw %k0, %k0
-; X64-MIC-AVX-NEXT:    sete %al
-; X64-MIC-AVX-NEXT:    vzeroupper
-; X64-MIC-AVX-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 16) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-; PR33914 - https://bugs.llvm.org/show_bug.cgi?id=33914
-
-define i32 @length24(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length24:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rcx
-; X64-NEXT:    movq (%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    jne .LBB38_3
-; X64-NEXT:  # %bb.1: # %loadbb1
-; X64-NEXT:    movq 8(%rdi), %rcx
-; X64-NEXT:    movq 8(%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    jne .LBB38_3
-; X64-NEXT:  # %bb.2: # %loadbb2
-; X64-NEXT:    movq 16(%rdi), %rcx
-; X64-NEXT:    movq 16(%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    je .LBB38_4
-; X64-NEXT:  .LBB38_3: # %res_block
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    orl $1, %eax
-; X64-NEXT:  .LBB38_4: # %endblock
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 24) nounwind
-  ret i32 %m
-}
-
-define i1 @length24_eq(ptr %x, ptr %y) nounwind {
-; X64-SSE2-LABEL: length24_eq:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT:    movdqu (%rsi), %xmm1
-; X64-SSE2-NEXT:    pcmpeqb %xmm0, %xmm1
-; X64-SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X64-SSE2-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
-; X64-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
-; X64-SSE2-NEXT:    pand %xmm1, %xmm2
-; X64-SSE2-NEXT:    pmovmskb %xmm2, %eax
-; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT:    sete %al
-; X64-SSE2-NEXT:    retq
-;
-; X64-SSE41-LABEL: length24_eq:
-; X64-SSE41:       # %bb.0:
-; X64-SSE41-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE41-NEXT:    movdqu (%rsi), %xmm1
-; X64-SSE41-NEXT:    pxor %xmm0, %xmm1
-; X64-SSE41-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X64-SSE41-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
-; X64-SSE41-NEXT:    pxor %xmm0, %xmm2
-; X64-SSE41-NEXT:    por %xmm1, %xmm2
-; X64-SSE41-NEXT:    ptest %xmm2, %xmm2
-; X64-SSE41-NEXT:    sete %al
-; X64-SSE41-NEXT:    retq
-;
-; X64-AVX-LABEL: length24_eq:
-; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
-; X64-AVX-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
-; X64-AVX-NEXT:    vpxor %xmm2, %xmm1, %xmm1
-; X64-AVX-NEXT:    vpxor (%rsi), %xmm0, %xmm0
-; X64-AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
-; X64-AVX-NEXT:    vptest %xmm0, %xmm0
-; X64-AVX-NEXT:    sete %al
-; X64-AVX-NEXT:    retq
-;
-; X64-MIC-AVX-LABEL: length24_eq:
-; X64-MIC-AVX:       # %bb.0:
-; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-MIC-AVX-NEXT:    vmovdqu (%rsi), %xmm1
-; X64-MIC-AVX-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
-; X64-MIC-AVX-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm3, %zmm2, %k0
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm1, %zmm0, %k1
-; X64-MIC-AVX-NEXT:    kortestw %k0, %k1
-; X64-MIC-AVX-NEXT:    sete %al
-; X64-MIC-AVX-NEXT:    vzeroupper
-; X64-MIC-AVX-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 24) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length24_lt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length24_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rcx
-; X64-NEXT:    movq (%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    jne .LBB40_3
-; X64-NEXT:  # %bb.1: # %loadbb1
-; X64-NEXT:    movq 8(%rdi), %rcx
-; X64-NEXT:    movq 8(%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    jne .LBB40_3
-; X64-NEXT:  # %bb.2: # %loadbb2
-; X64-NEXT:    movq 16(%rdi), %rcx
-; X64-NEXT:    movq 16(%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    je .LBB40_4
-; X64-NEXT:  .LBB40_3: # %res_block
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    orl $1, %eax
-; X64-NEXT:  .LBB40_4: # %endblock
-; X64-NEXT:    shrl $31, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 24) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length24_gt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length24_gt:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rax
-; X64-NEXT:    movq (%rsi), %rcx
-; X64-NEXT:    bswapq %rax
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    cmpq %rcx, %rax
-; X64-NEXT:    jne .LBB41_3
-; X64-NEXT:  # %bb.1: # %loadbb1
-; X64-NEXT:    movq 8(%rdi), %rax
-; X64-NEXT:    movq 8(%rsi), %rcx
-; X64-NEXT:    bswapq %rax
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    cmpq %rcx, %rax
-; X64-NEXT:    jne .LBB41_3
-; X64-NEXT:  # %bb.2: # %loadbb2
-; X64-NEXT:    movq 16(%rdi), %rax
-; X64-NEXT:    movq 16(%rsi), %rcx
-; X64-NEXT:    bswapq %rax
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    xorl %edx, %edx
-; X64-NEXT:    cmpq %rcx, %rax
-; X64-NEXT:    je .LBB41_4
-; X64-NEXT:  .LBB41_3: # %res_block
-; X64-NEXT:    xorl %edx, %edx
-; X64-NEXT:    cmpq %rcx, %rax
-; X64-NEXT:    sbbl %edx, %edx
-; X64-NEXT:    orl $1, %edx
-; X64-NEXT:  .LBB41_4: # %endblock
-; X64-NEXT:    testl %edx, %edx
-; X64-NEXT:    setg %al
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 24) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length24_eq_const(ptr %X) nounwind {
-; X64-SSE2-LABEL: length24_eq_const:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
-; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE2-NEXT:    pand %xmm1, %xmm0
-; X64-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT:    setne %al
-; X64-SSE2-NEXT:    retq
-;
-; X64-SSE41-LABEL: length24_eq_const:
-; X64-SSE41:       # %bb.0:
-; X64-SSE41-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE41-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
-; X64-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; X64-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE41-NEXT:    por %xmm1, %xmm0
-; X64-SSE41-NEXT:    ptest %xmm0, %xmm0
-; X64-SSE41-NEXT:    setne %al
-; X64-SSE41-NEXT:    retq
-;
-; X64-AVX-LABEL: length24_eq_const:
-; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
-; X64-AVX-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; X64-AVX-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT:    vptest %xmm0, %xmm0
-; X64-AVX-NEXT:    setne %al
-; X64-AVX-NEXT:    retq
-;
-; X64-MIC-AVX-LABEL: length24_eq_const:
-; X64-MIC-AVX:       # %bb.0:
-; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-MIC-AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
-; X64-MIC-AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [959985462,858927408,0,0]
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm2, %zmm1, %k0
-; X64-MIC-AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [858927408,926299444,825243960,892613426]
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm1, %zmm0, %k1
-; X64-MIC-AVX-NEXT:    kortestw %k0, %k1
-; X64-MIC-AVX-NEXT:    setne %al
-; X64-MIC-AVX-NEXT:    vzeroupper
-; X64-MIC-AVX-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 24) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length31(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length31:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rcx
-; X64-NEXT:    movq (%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    jne .LBB43_4
-; X64-NEXT:  # %bb.1: # %loadbb1
-; X64-NEXT:    movq 8(%rdi), %rcx
-; X64-NEXT:    movq 8(%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    jne .LBB43_4
-; X64-NEXT:  # %bb.2: # %loadbb2
-; X64-NEXT:    movq 16(%rdi), %rcx
-; X64-NEXT:    movq 16(%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    jne .LBB43_4
-; X64-NEXT:  # %bb.3: # %loadbb3
-; X64-NEXT:    movq 23(%rdi), %rcx
-; X64-NEXT:    movq 23(%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    je .LBB43_5
-; X64-NEXT:  .LBB43_4: # %res_block
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    orl $1, %eax
-; X64-NEXT:  .LBB43_5: # %endblock
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 31) nounwind
-  ret i32 %m
-}
-
-define i1 @length31_eq(ptr %x, ptr %y) nounwind {
-; X64-SSE2-LABEL: length31_eq:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT:    movdqu 15(%rdi), %xmm1
-; X64-SSE2-NEXT:    movdqu (%rsi), %xmm2
-; X64-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
-; X64-SSE2-NEXT:    movdqu 15(%rsi), %xmm0
-; X64-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
-; X64-SSE2-NEXT:    pand %xmm2, %xmm0
-; X64-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT:    sete %al
-; X64-SSE2-NEXT:    retq
-;
-; X64-SSE41-LABEL: length31_eq:
-; X64-SSE41:       # %bb.0:
-; X64-SSE41-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE41-NEXT:    movdqu 15(%rdi), %xmm1
-; X64-SSE41-NEXT:    movdqu (%rsi), %xmm2
-; X64-SSE41-NEXT:    pxor %xmm0, %xmm2
-; X64-SSE41-NEXT:    movdqu 15(%rsi), %xmm0
-; X64-SSE41-NEXT:    pxor %xmm1, %xmm0
-; X64-SSE41-NEXT:    por %xmm2, %xmm0
-; X64-SSE41-NEXT:    ptest %xmm0, %xmm0
-; X64-SSE41-NEXT:    sete %al
-; X64-SSE41-NEXT:    retq
-;
-; X64-AVX-LABEL: length31_eq:
-; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-AVX-NEXT:    vmovdqu 15(%rdi), %xmm1
-; X64-AVX-NEXT:    vpxor 15(%rsi), %xmm1, %xmm1
-; X64-AVX-NEXT:    vpxor (%rsi), %xmm0, %xmm0
-; X64-AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT:    vptest %xmm0, %xmm0
-; X64-AVX-NEXT:    sete %al
-; X64-AVX-NEXT:    retq
-;
-; X64-MIC-AVX-LABEL: length31_eq:
-; X64-MIC-AVX:       # %bb.0:
-; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-MIC-AVX-NEXT:    vmovdqu 15(%rdi), %xmm1
-; X64-MIC-AVX-NEXT:    vmovdqu (%rsi), %xmm2
-; X64-MIC-AVX-NEXT:    vmovdqu 15(%rsi), %xmm3
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm3, %zmm1, %k0
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm2, %zmm0, %k1
-; X64-MIC-AVX-NEXT:    kortestw %k0, %k1
-; X64-MIC-AVX-NEXT:    sete %al
-; X64-MIC-AVX-NEXT:    vzeroupper
-; X64-MIC-AVX-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 31) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length31_lt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length31_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rcx
-; X64-NEXT:    movq (%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    jne .LBB45_4
-; X64-NEXT:  # %bb.1: # %loadbb1
-; X64-NEXT:    movq 8(%rdi), %rcx
-; X64-NEXT:    movq 8(%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    jne .LBB45_4
-; X64-NEXT:  # %bb.2: # %loadbb2
-; X64-NEXT:    movq 16(%rdi), %rcx
-; X64-NEXT:    movq 16(%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    jne .LBB45_4
-; X64-NEXT:  # %bb.3: # %loadbb3
-; X64-NEXT:    movq 23(%rdi), %rcx
-; X64-NEXT:    movq 23(%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    je .LBB45_5
-; X64-NEXT:  .LBB45_4: # %res_block
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    orl $1, %eax
-; X64-NEXT:  .LBB45_5: # %endblock
-; X64-NEXT:    shrl $31, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 31) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length31_gt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length31_gt:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rax
-; X64-NEXT:    movq (%rsi), %rcx
-; X64-NEXT:    bswapq %rax
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    cmpq %rcx, %rax
-; X64-NEXT:    jne .LBB46_4
-; X64-NEXT:  # %bb.1: # %loadbb1
-; X64-NEXT:    movq 8(%rdi), %rax
-; X64-NEXT:    movq 8(%rsi), %rcx
-; X64-NEXT:    bswapq %rax
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    cmpq %rcx, %rax
-; X64-NEXT:    jne .LBB46_4
-; X64-NEXT:  # %bb.2: # %loadbb2
-; X64-NEXT:    movq 16(%rdi), %rax
-; X64-NEXT:    movq 16(%rsi), %rcx
-; X64-NEXT:    bswapq %rax
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    cmpq %rcx, %rax
-; X64-NEXT:    jne .LBB46_4
-; X64-NEXT:  # %bb.3: # %loadbb3
-; X64-NEXT:    movq 23(%rdi), %rax
-; X64-NEXT:    movq 23(%rsi), %rcx
-; X64-NEXT:    bswapq %rax
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    xorl %edx, %edx
-; X64-NEXT:    cmpq %rcx, %rax
-; X64-NEXT:    je .LBB46_5
-; X64-NEXT:  .LBB46_4: # %res_block
-; X64-NEXT:    xorl %edx, %edx
-; X64-NEXT:    cmpq %rcx, %rax
-; X64-NEXT:    sbbl %edx, %edx
-; X64-NEXT:    orl $1, %edx
-; X64-NEXT:  .LBB46_5: # %endblock
-; X64-NEXT:    testl %edx, %edx
-; X64-NEXT:    setg %al
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 31) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length31_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"="128" {
-; X64-SSE2-LABEL: length31_eq_prefer128:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT:    movdqu 15(%rdi), %xmm1
-; X64-SSE2-NEXT:    movdqu (%rsi), %xmm2
-; X64-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
-; X64-SSE2-NEXT:    movdqu 15(%rsi), %xmm0
-; X64-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
-; X64-SSE2-NEXT:    pand %xmm2, %xmm0
-; X64-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT:    sete %al
-; X64-SSE2-NEXT:    retq
-;
-; X64-SSE41-LABEL: length31_eq_prefer128:
-; X64-SSE41:       # %bb.0:
-; X64-SSE41-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE41-NEXT:    movdqu 15(%rdi), %xmm1
-; X64-SSE41-NEXT:    movdqu (%rsi), %xmm2
-; X64-SSE41-NEXT:    pxor %xmm0, %xmm2
-; X64-SSE41-NEXT:    movdqu 15(%rsi), %xmm0
-; X64-SSE41-NEXT:    pxor %xmm1, %xmm0
-; X64-SSE41-NEXT:    por %xmm2, %xmm0
-; X64-SSE41-NEXT:    ptest %xmm0, %xmm0
-; X64-SSE41-NEXT:    sete %al
-; X64-SSE41-NEXT:    retq
-;
-; X64-AVX-LABEL: length31_eq_prefer128:
-; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-AVX-NEXT:    vmovdqu 15(%rdi), %xmm1
-; X64-AVX-NEXT:    vpxor 15(%rsi), %xmm1, %xmm1
-; X64-AVX-NEXT:    vpxor (%rsi), %xmm0, %xmm0
-; X64-AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT:    vptest %xmm0, %xmm0
-; X64-AVX-NEXT:    sete %al
-; X64-AVX-NEXT:    retq
-;
-; X64-MIC-AVX-LABEL: length31_eq_prefer128:
-; X64-MIC-AVX:       # %bb.0:
-; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-MIC-AVX-NEXT:    vmovdqu 15(%rdi), %xmm1
-; X64-MIC-AVX-NEXT:    vmovdqu (%rsi), %xmm2
-; X64-MIC-AVX-NEXT:    vmovdqu 15(%rsi), %xmm3
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm3, %zmm1, %k0
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm2, %zmm0, %k1
-; X64-MIC-AVX-NEXT:    kortestw %k0, %k1
-; X64-MIC-AVX-NEXT:    sete %al
-; X64-MIC-AVX-NEXT:    vzeroupper
-; X64-MIC-AVX-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 31) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length31_eq_const(ptr %X) nounwind {
-; X64-SSE2-LABEL: length31_eq_const:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT:    movdqu 15(%rdi), %xmm1
-; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE2-NEXT:    pand %xmm1, %xmm0
-; X64-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT:    setne %al
-; X64-SSE2-NEXT:    retq
-;
-; X64-SSE41-LABEL: length31_eq_const:
-; X64-SSE41:       # %bb.0:
-; X64-SSE41-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE41-NEXT:    movdqu 15(%rdi), %xmm1
-; X64-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; X64-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE41-NEXT:    por %xmm1, %xmm0
-; X64-SSE41-NEXT:    ptest %xmm0, %xmm0
-; X64-SSE41-NEXT:    setne %al
-; X64-SSE41-NEXT:    retq
-;
-; X64-AVX-LABEL: length31_eq_const:
-; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-AVX-NEXT:    vmovdqu 15(%rdi), %xmm1
-; X64-AVX-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; X64-AVX-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT:    vptest %xmm0, %xmm0
-; X64-AVX-NEXT:    setne %al
-; X64-AVX-NEXT:    retq
-;
-; X64-MIC-AVX-LABEL: length31_eq_const:
-; X64-MIC-AVX:       # %bb.0:
-; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-MIC-AVX-NEXT:    vmovdqu 15(%rdi), %xmm1
-; X64-MIC-AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [943142453,842084409,909456435,809056311]
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm2, %zmm1, %k0
-; X64-MIC-AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [858927408,926299444,825243960,892613426]
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm1, %zmm0, %k1
-; X64-MIC-AVX-NEXT:    kortestw %k0, %k1
-; X64-MIC-AVX-NEXT:    setne %al
-; X64-MIC-AVX-NEXT:    vzeroupper
-; X64-MIC-AVX-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 31) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length32(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length32:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rcx
-; X64-NEXT:    movq (%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    jne .LBB49_4
-; X64-NEXT:  # %bb.1: # %loadbb1
-; X64-NEXT:    movq 8(%rdi), %rcx
-; X64-NEXT:    movq 8(%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    jne .LBB49_4
-; X64-NEXT:  # %bb.2: # %loadbb2
-; X64-NEXT:    movq 16(%rdi), %rcx
-; X64-NEXT:    movq 16(%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    jne .LBB49_4
-; X64-NEXT:  # %bb.3: # %loadbb3
-; X64-NEXT:    movq 24(%rdi), %rcx
-; X64-NEXT:    movq 24(%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    je .LBB49_5
-; X64-NEXT:  .LBB49_4: # %res_block
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    orl $1, %eax
-; X64-NEXT:  .LBB49_5: # %endblock
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 32) nounwind
-  ret i32 %m
-}
-
-; PR33325 - https://bugs.llvm.org/show_bug.cgi?id=33325
-
-define i1 @length32_eq(ptr %x, ptr %y) nounwind {
-; X64-SSE2-LABEL: length32_eq:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-SSE2-NEXT:    movdqu (%rsi), %xmm2
-; X64-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
-; X64-SSE2-NEXT:    movdqu 16(%rsi), %xmm0
-; X64-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
-; X64-SSE2-NEXT:    pand %xmm2, %xmm0
-; X64-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT:    sete %al
-; X64-SSE2-NEXT:    retq
-;
-; X64-SSE41-LABEL: length32_eq:
-; X64-SSE41:       # %bb.0:
-; X64-SSE41-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE41-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-SSE41-NEXT:    movdqu (%rsi), %xmm2
-; X64-SSE41-NEXT:    pxor %xmm0, %xmm2
-; X64-SSE41-NEXT:    movdqu 16(%rsi), %xmm0
-; X64-SSE41-NEXT:    pxor %xmm1, %xmm0
-; X64-SSE41-NEXT:    por %xmm2, %xmm0
-; X64-SSE41-NEXT:    ptest %xmm0, %xmm0
-; X64-SSE41-NEXT:    sete %al
-; X64-SSE41-NEXT:    retq
-;
-; X64-AVX1-LABEL: length32_eq:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    vmovups (%rdi), %ymm0
-; X64-AVX1-NEXT:    vxorps (%rsi), %ymm0, %ymm0
-; X64-AVX1-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX1-NEXT:    sete %al
-; X64-AVX1-NEXT:    vzeroupper
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length32_eq:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT:    vpxor (%rsi), %ymm0, %ymm0
-; X64-AVX2-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX2-NEXT:    sete %al
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    retq
-;
-; X64-AVX512-LABEL: length32_eq:
-; X64-AVX512:       # %bb.0:
-; X64-AVX512-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX512-NEXT:    vpxor (%rsi), %ymm0, %ymm0
-; X64-AVX512-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX512-NEXT:    sete %al
-; X64-AVX512-NEXT:    vzeroupper
-; X64-AVX512-NEXT:    retq
-;
-; X64-MIC-AVX-LABEL: length32_eq:
-; X64-MIC-AVX:       # %bb.0:
-; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-MIC-AVX-NEXT:    vmovdqu (%rsi), %ymm1
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0
-; X64-MIC-AVX-NEXT:    kortestw %k0, %k0
-; X64-MIC-AVX-NEXT:    sete %al
-; X64-MIC-AVX-NEXT:    vzeroupper
-; X64-MIC-AVX-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 32) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length32_lt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length32_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rcx
-; X64-NEXT:    movq (%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    jne .LBB51_4
-; X64-NEXT:  # %bb.1: # %loadbb1
-; X64-NEXT:    movq 8(%rdi), %rcx
-; X64-NEXT:    movq 8(%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    jne .LBB51_4
-; X64-NEXT:  # %bb.2: # %loadbb2
-; X64-NEXT:    movq 16(%rdi), %rcx
-; X64-NEXT:    movq 16(%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    jne .LBB51_4
-; X64-NEXT:  # %bb.3: # %loadbb3
-; X64-NEXT:    movq 24(%rdi), %rcx
-; X64-NEXT:    movq 24(%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    je .LBB51_5
-; X64-NEXT:  .LBB51_4: # %res_block
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    orl $1, %eax
-; X64-NEXT:  .LBB51_5: # %endblock
-; X64-NEXT:    shrl $31, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 32) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length32_gt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length32_gt:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rax
-; X64-NEXT:    movq (%rsi), %rcx
-; X64-NEXT:    bswapq %rax
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    cmpq %rcx, %rax
-; X64-NEXT:    jne .LBB52_4
-; X64-NEXT:  # %bb.1: # %loadbb1
-; X64-NEXT:    movq 8(%rdi), %rax
-; X64-NEXT:    movq 8(%rsi), %rcx
-; X64-NEXT:    bswapq %rax
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    cmpq %rcx, %rax
-; X64-NEXT:    jne .LBB52_4
-; X64-NEXT:  # %bb.2: # %loadbb2
-; X64-NEXT:    movq 16(%rdi), %rax
-; X64-NEXT:    movq 16(%rsi), %rcx
-; X64-NEXT:    bswapq %rax
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    cmpq %rcx, %rax
-; X64-NEXT:    jne .LBB52_4
-; X64-NEXT:  # %bb.3: # %loadbb3
-; X64-NEXT:    movq 24(%rdi), %rax
-; X64-NEXT:    movq 24(%rsi), %rcx
-; X64-NEXT:    bswapq %rax
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    xorl %edx, %edx
-; X64-NEXT:    cmpq %rcx, %rax
-; X64-NEXT:    je .LBB52_5
-; X64-NEXT:  .LBB52_4: # %res_block
-; X64-NEXT:    xorl %edx, %edx
-; X64-NEXT:    cmpq %rcx, %rax
-; X64-NEXT:    sbbl %edx, %edx
-; X64-NEXT:    orl $1, %edx
-; X64-NEXT:  .LBB52_5: # %endblock
-; X64-NEXT:    testl %edx, %edx
-; X64-NEXT:    setg %al
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 32) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length32_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"="128" {
-; X64-SSE2-LABEL: length32_eq_prefer128:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-SSE2-NEXT:    movdqu (%rsi), %xmm2
-; X64-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
-; X64-SSE2-NEXT:    movdqu 16(%rsi), %xmm0
-; X64-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
-; X64-SSE2-NEXT:    pand %xmm2, %xmm0
-; X64-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT:    sete %al
-; X64-SSE2-NEXT:    retq
-;
-; X64-SSE41-LABEL: length32_eq_prefer128:
-; X64-SSE41:       # %bb.0:
-; X64-SSE41-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE41-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-SSE41-NEXT:    movdqu (%rsi), %xmm2
-; X64-SSE41-NEXT:    pxor %xmm0, %xmm2
-; X64-SSE41-NEXT:    movdqu 16(%rsi), %xmm0
-; X64-SSE41-NEXT:    pxor %xmm1, %xmm0
-; X64-SSE41-NEXT:    por %xmm2, %xmm0
-; X64-SSE41-NEXT:    ptest %xmm0, %xmm0
-; X64-SSE41-NEXT:    sete %al
-; X64-SSE41-NEXT:    retq
-;
-; X64-AVX-LABEL: length32_eq_prefer128:
-; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-AVX-NEXT:    vmovdqu 16(%rdi), %xmm1
-; X64-AVX-NEXT:    vpxor 16(%rsi), %xmm1, %xmm1
-; X64-AVX-NEXT:    vpxor (%rsi), %xmm0, %xmm0
-; X64-AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT:    vptest %xmm0, %xmm0
-; X64-AVX-NEXT:    sete %al
-; X64-AVX-NEXT:    retq
-;
-; X64-MIC-AVX-LABEL: length32_eq_prefer128:
-; X64-MIC-AVX:       # %bb.0:
-; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-MIC-AVX-NEXT:    vmovdqu 16(%rdi), %xmm1
-; X64-MIC-AVX-NEXT:    vmovdqu (%rsi), %xmm2
-; X64-MIC-AVX-NEXT:    vmovdqu 16(%rsi), %xmm3
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm3, %zmm1, %k0
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm2, %zmm0, %k1
-; X64-MIC-AVX-NEXT:    kortestw %k0, %k1
-; X64-MIC-AVX-NEXT:    sete %al
-; X64-MIC-AVX-NEXT:    vzeroupper
-; X64-MIC-AVX-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 32) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length32_eq_const(ptr %X) nounwind {
-; X64-SSE2-LABEL: length32_eq_const:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE2-NEXT:    pand %xmm1, %xmm0
-; X64-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT:    setne %al
-; X64-SSE2-NEXT:    retq
-;
-; X64-SSE41-LABEL: length32_eq_const:
-; X64-SSE41:       # %bb.0:
-; X64-SSE41-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE41-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; X64-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE41-NEXT:    por %xmm1, %xmm0
-; X64-SSE41-NEXT:    ptest %xmm0, %xmm0
-; X64-SSE41-NEXT:    setne %al
-; X64-SSE41-NEXT:    retq
-;
-; X64-AVX1-LABEL: length32_eq_const:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    vmovups (%rdi), %ymm0
-; X64-AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX1-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX1-NEXT:    setne %al
-; X64-AVX1-NEXT:    vzeroupper
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length32_eq_const:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX2-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX2-NEXT:    setne %al
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    retq
-;
-; X64-AVX512-LABEL: length32_eq_const:
-; X64-AVX512:       # %bb.0:
-; X64-AVX512-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX512-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX512-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX512-NEXT:    setne %al
-; X64-AVX512-NEXT:    vzeroupper
-; X64-AVX512-NEXT:    retq
-;
-; X64-MIC-AVX-LABEL: length32_eq_const:
-; X64-MIC-AVX:       # %bb.0:
-; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-MIC-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [858927408,926299444,825243960,892613426,959985462,858927408,926299444,825243960]
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0
-; X64-MIC-AVX-NEXT:    kortestw %k0, %k0
-; X64-MIC-AVX-NEXT:    setne %al
-; X64-MIC-AVX-NEXT:    vzeroupper
-; X64-MIC-AVX-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 32) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length48(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length48:
-; X64:       # %bb.0:
-; X64-NEXT:    movl $48, %edx
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 48) nounwind
-  ret i32 %m
-}
-
-define i1 @length48_eq(ptr %x, ptr %y) nounwind {
-; X64-SSE2-LABEL: length48_eq:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-SSE2-NEXT:    movdqu 32(%rdi), %xmm2
-; X64-SSE2-NEXT:    movdqu (%rsi), %xmm3
-; X64-SSE2-NEXT:    pcmpeqb %xmm0, %xmm3
-; X64-SSE2-NEXT:    movdqu 16(%rsi), %xmm0
-; X64-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
-; X64-SSE2-NEXT:    pand %xmm3, %xmm0
-; X64-SSE2-NEXT:    movdqu 32(%rsi), %xmm1
-; X64-SSE2-NEXT:    pcmpeqb %xmm2, %xmm1
-; X64-SSE2-NEXT:    pand %xmm0, %xmm1
-; X64-SSE2-NEXT:    pmovmskb %xmm1, %eax
-; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT:    sete %al
-; X64-SSE2-NEXT:    retq
-;
-; X64-SSE41-LABEL: length48_eq:
-; X64-SSE41:       # %bb.0:
-; X64-SSE41-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE41-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-SSE41-NEXT:    movdqu 32(%rdi), %xmm2
-; X64-SSE41-NEXT:    movdqu (%rsi), %xmm3
-; X64-SSE41-NEXT:    pxor %xmm0, %xmm3
-; X64-SSE41-NEXT:    movdqu 16(%rsi), %xmm0
-; X64-SSE41-NEXT:    pxor %xmm1, %xmm0
-; X64-SSE41-NEXT:    por %xmm3, %xmm0
-; X64-SSE41-NEXT:    movdqu 32(%rsi), %xmm1
-; X64-SSE41-NEXT:    pxor %xmm2, %xmm1
-; X64-SSE41-NEXT:    por %xmm0, %xmm1
-; X64-SSE41-NEXT:    ptest %xmm1, %xmm1
-; X64-SSE41-NEXT:    sete %al
-; X64-SSE41-NEXT:    retq
-;
-; X64-AVX1-LABEL: length48_eq:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    vmovups (%rdi), %ymm0
-; X64-AVX1-NEXT:    vmovups 32(%rdi), %xmm1
-; X64-AVX1-NEXT:    vmovups 32(%rsi), %xmm2
-; X64-AVX1-NEXT:    vxorps (%rsi), %ymm0, %ymm0
-; X64-AVX1-NEXT:    vxorps %ymm2, %ymm1, %ymm1
-; X64-AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
-; X64-AVX1-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX1-NEXT:    sete %al
-; X64-AVX1-NEXT:    vzeroupper
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length48_eq:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT:    vmovdqu 32(%rdi), %xmm1
-; X64-AVX2-NEXT:    vmovdqu 32(%rsi), %xmm2
-; X64-AVX2-NEXT:    vpxor (%rsi), %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpxor %ymm2, %ymm1, %ymm1
-; X64-AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX2-NEXT:    sete %al
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    retq
-;
-; X64-AVX512-LABEL: length48_eq:
-; X64-AVX512:       # %bb.0:
-; X64-AVX512-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX512-NEXT:    vmovdqu 32(%rdi), %xmm1
-; X64-AVX512-NEXT:    vmovdqu 32(%rsi), %xmm2
-; X64-AVX512-NEXT:    vpxor (%rsi), %ymm0, %ymm0
-; X64-AVX512-NEXT:    vpxor %ymm2, %ymm1, %ymm1
-; X64-AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX512-NEXT:    sete %al
-; X64-AVX512-NEXT:    vzeroupper
-; X64-AVX512-NEXT:    retq
-;
-; X64-MIC-AVX-LABEL: length48_eq:
-; X64-MIC-AVX:       # %bb.0:
-; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-MIC-AVX-NEXT:    vmovdqu (%rsi), %ymm1
-; X64-MIC-AVX-NEXT:    vmovdqu 32(%rdi), %xmm2
-; X64-MIC-AVX-NEXT:    vmovdqu 32(%rsi), %xmm3
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm3, %zmm2, %k0
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm1, %zmm0, %k1
-; X64-MIC-AVX-NEXT:    kortestw %k0, %k1
-; X64-MIC-AVX-NEXT:    sete %al
-; X64-MIC-AVX-NEXT:    vzeroupper
-; X64-MIC-AVX-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 48) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length48_lt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length48_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $48, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    shrl $31, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 48) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length48_gt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length48_gt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $48, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setg %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 48) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length48_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"="128" {
-; X64-SSE2-LABEL: length48_eq_prefer128:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-SSE2-NEXT:    movdqu 32(%rdi), %xmm2
-; X64-SSE2-NEXT:    movdqu (%rsi), %xmm3
-; X64-SSE2-NEXT:    pcmpeqb %xmm0, %xmm3
-; X64-SSE2-NEXT:    movdqu 16(%rsi), %xmm0
-; X64-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
-; X64-SSE2-NEXT:    pand %xmm3, %xmm0
-; X64-SSE2-NEXT:    movdqu 32(%rsi), %xmm1
-; X64-SSE2-NEXT:    pcmpeqb %xmm2, %xmm1
-; X64-SSE2-NEXT:    pand %xmm0, %xmm1
-; X64-SSE2-NEXT:    pmovmskb %xmm1, %eax
-; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT:    sete %al
-; X64-SSE2-NEXT:    retq
-;
-; X64-SSE41-LABEL: length48_eq_prefer128:
-; X64-SSE41:       # %bb.0:
-; X64-SSE41-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE41-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-SSE41-NEXT:    movdqu 32(%rdi), %xmm2
-; X64-SSE41-NEXT:    movdqu (%rsi), %xmm3
-; X64-SSE41-NEXT:    pxor %xmm0, %xmm3
-; X64-SSE41-NEXT:    movdqu 16(%rsi), %xmm0
-; X64-SSE41-NEXT:    pxor %xmm1, %xmm0
-; X64-SSE41-NEXT:    por %xmm3, %xmm0
-; X64-SSE41-NEXT:    movdqu 32(%rsi), %xmm1
-; X64-SSE41-NEXT:    pxor %xmm2, %xmm1
-; X64-SSE41-NEXT:    por %xmm0, %xmm1
-; X64-SSE41-NEXT:    ptest %xmm1, %xmm1
-; X64-SSE41-NEXT:    sete %al
-; X64-SSE41-NEXT:    retq
-;
-; X64-AVX-LABEL: length48_eq_prefer128:
-; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-AVX-NEXT:    vmovdqu 16(%rdi), %xmm1
-; X64-AVX-NEXT:    vmovdqu 32(%rdi), %xmm2
-; X64-AVX-NEXT:    vpxor 16(%rsi), %xmm1, %xmm1
-; X64-AVX-NEXT:    vpxor (%rsi), %xmm0, %xmm0
-; X64-AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT:    vpxor 32(%rsi), %xmm2, %xmm1
-; X64-AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT:    vptest %xmm0, %xmm0
-; X64-AVX-NEXT:    sete %al
-; X64-AVX-NEXT:    retq
-;
-; X64-MIC-AVX-LABEL: length48_eq_prefer128:
-; X64-MIC-AVX:       # %bb.0:
-; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-MIC-AVX-NEXT:    vmovdqu 16(%rdi), %xmm1
-; X64-MIC-AVX-NEXT:    vmovdqu 32(%rdi), %xmm2
-; X64-MIC-AVX-NEXT:    vmovdqu (%rsi), %xmm3
-; X64-MIC-AVX-NEXT:    vmovdqu 16(%rsi), %xmm4
-; X64-MIC-AVX-NEXT:    vmovdqu 32(%rsi), %xmm5
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm4, %zmm1, %k0
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm3, %zmm0, %k1
-; X64-MIC-AVX-NEXT:    korw %k0, %k1, %k0
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm5, %zmm2, %k1
-; X64-MIC-AVX-NEXT:    kortestw %k1, %k0
-; X64-MIC-AVX-NEXT:    sete %al
-; X64-MIC-AVX-NEXT:    vzeroupper
-; X64-MIC-AVX-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 48) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length48_eq_const(ptr %X) nounwind {
-; X64-SSE2-LABEL: length48_eq_const:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-SSE2-NEXT:    movdqu 32(%rdi), %xmm2
-; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE2-NEXT:    pand %xmm1, %xmm0
-; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; X64-SSE2-NEXT:    pand %xmm0, %xmm2
-; X64-SSE2-NEXT:    pmovmskb %xmm2, %eax
-; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT:    setne %al
-; X64-SSE2-NEXT:    retq
-;
-; X64-SSE41-LABEL: length48_eq_const:
-; X64-SSE41:       # %bb.0:
-; X64-SSE41-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE41-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-SSE41-NEXT:    movdqu 32(%rdi), %xmm2
-; X64-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; X64-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE41-NEXT:    por %xmm1, %xmm0
-; X64-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; X64-SSE41-NEXT:    por %xmm0, %xmm2
-; X64-SSE41-NEXT:    ptest %xmm2, %xmm2
-; X64-SSE41-NEXT:    setne %al
-; X64-SSE41-NEXT:    retq
-;
-; X64-AVX1-LABEL: length48_eq_const:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    vmovups (%rdi), %ymm0
-; X64-AVX1-NEXT:    vmovups 32(%rdi), %xmm1
-; X64-AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; X64-AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
-; X64-AVX1-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX1-NEXT:    setne %al
-; X64-AVX1-NEXT:    vzeroupper
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length48_eq_const:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT:    vmovdqu 32(%rdi), %xmm1
-; X64-AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; X64-AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX2-NEXT:    setne %al
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    retq
-;
-; X64-AVX512-LABEL: length48_eq_const:
-; X64-AVX512:       # %bb.0:
-; X64-AVX512-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX512-NEXT:    vmovdqu 32(%rdi), %xmm1
-; X64-AVX512-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX512-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; X64-AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX512-NEXT:    setne %al
-; X64-AVX512-NEXT:    vzeroupper
-; X64-AVX512-NEXT:    retq
-;
-; X64-MIC-AVX-LABEL: length48_eq_const:
-; X64-MIC-AVX:       # %bb.0:
-; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-MIC-AVX-NEXT:    vmovdqu 32(%rdi), %xmm1
-; X64-MIC-AVX-NEXT:    vmovdqa {{.*#+}} ymm2 = [892613426,959985462,858927408,926299444,0,0,0,0]
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm2, %zmm1, %k0
-; X64-MIC-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [858927408,926299444,825243960,892613426,959985462,858927408,926299444,825243960]
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm1, %zmm0, %k1
-; X64-MIC-AVX-NEXT:    kortestw %k0, %k1
-; X64-MIC-AVX-NEXT:    setne %al
-; X64-MIC-AVX-NEXT:    vzeroupper
-; X64-MIC-AVX-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 48) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length63(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length63:
-; X64:       # %bb.0:
-; X64-NEXT:    movl $63, %edx
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 63) nounwind
-  ret i32 %m
-}
-
-define i1 @length63_eq(ptr %x, ptr %y) nounwind {
-; X64-SSE2-LABEL: length63_eq:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-SSE2-NEXT:    movdqu 32(%rdi), %xmm2
-; X64-SSE2-NEXT:    movdqu 47(%rdi), %xmm3
-; X64-SSE2-NEXT:    movdqu (%rsi), %xmm4
-; X64-SSE2-NEXT:    pcmpeqb %xmm0, %xmm4
-; X64-SSE2-NEXT:    movdqu 16(%rsi), %xmm0
-; X64-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
-; X64-SSE2-NEXT:    pand %xmm4, %xmm0
-; X64-SSE2-NEXT:    movdqu 32(%rsi), %xmm1
-; X64-SSE2-NEXT:    pcmpeqb %xmm2, %xmm1
-; X64-SSE2-NEXT:    movdqu 47(%rsi), %xmm2
-; X64-SSE2-NEXT:    pcmpeqb %xmm3, %xmm2
-; X64-SSE2-NEXT:    pand %xmm1, %xmm2
-; X64-SSE2-NEXT:    pand %xmm0, %xmm2
-; X64-SSE2-NEXT:    pmovmskb %xmm2, %eax
-; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT:    setne %al
-; X64-SSE2-NEXT:    retq
-;
-; X64-SSE41-LABEL: length63_eq:
-; X64-SSE41:       # %bb.0:
-; X64-SSE41-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE41-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-SSE41-NEXT:    movdqu 32(%rdi), %xmm2
-; X64-SSE41-NEXT:    movdqu 47(%rdi), %xmm3
-; X64-SSE41-NEXT:    movdqu (%rsi), %xmm4
-; X64-SSE41-NEXT:    pxor %xmm0, %xmm4
-; X64-SSE41-NEXT:    movdqu 16(%rsi), %xmm0
-; X64-SSE41-NEXT:    pxor %xmm1, %xmm0
-; X64-SSE41-NEXT:    por %xmm4, %xmm0
-; X64-SSE41-NEXT:    movdqu 32(%rsi), %xmm1
-; X64-SSE41-NEXT:    pxor %xmm2, %xmm1
-; X64-SSE41-NEXT:    movdqu 47(%rsi), %xmm2
-; X64-SSE41-NEXT:    pxor %xmm3, %xmm2
-; X64-SSE41-NEXT:    por %xmm1, %xmm2
-; X64-SSE41-NEXT:    por %xmm0, %xmm2
-; X64-SSE41-NEXT:    ptest %xmm2, %xmm2
-; X64-SSE41-NEXT:    setne %al
-; X64-SSE41-NEXT:    retq
-;
-; X64-AVX1-LABEL: length63_eq:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    vmovups (%rdi), %ymm0
-; X64-AVX1-NEXT:    vmovups 31(%rdi), %ymm1
-; X64-AVX1-NEXT:    vxorps 31(%rsi), %ymm1, %ymm1
-; X64-AVX1-NEXT:    vxorps (%rsi), %ymm0, %ymm0
-; X64-AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
-; X64-AVX1-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX1-NEXT:    setne %al
-; X64-AVX1-NEXT:    vzeroupper
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length63_eq:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT:    vmovdqu 31(%rdi), %ymm1
-; X64-AVX2-NEXT:    vpxor 31(%rsi), %ymm1, %ymm1
-; X64-AVX2-NEXT:    vpxor (%rsi), %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX2-NEXT:    setne %al
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    retq
-;
-; X64-AVX512-LABEL: length63_eq:
-; X64-AVX512:       # %bb.0:
-; X64-AVX512-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX512-NEXT:    vmovdqu 31(%rdi), %ymm1
-; X64-AVX512-NEXT:    vpxor 31(%rsi), %ymm1, %ymm1
-; X64-AVX512-NEXT:    vpxor (%rsi), %ymm0, %ymm0
-; X64-AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX512-NEXT:    setne %al
-; X64-AVX512-NEXT:    vzeroupper
-; X64-AVX512-NEXT:    retq
-;
-; X64-MIC-AVX-LABEL: length63_eq:
-; X64-MIC-AVX:       # %bb.0:
-; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-MIC-AVX-NEXT:    vmovdqu 31(%rdi), %ymm1
-; X64-MIC-AVX-NEXT:    vmovdqu (%rsi), %ymm2
-; X64-MIC-AVX-NEXT:    vmovdqu 31(%rsi), %ymm3
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm3, %zmm1, %k0
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm2, %zmm0, %k1
-; X64-MIC-AVX-NEXT:    kortestw %k0, %k1
-; X64-MIC-AVX-NEXT:    setne %al
-; X64-MIC-AVX-NEXT:    vzeroupper
-; X64-MIC-AVX-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 63) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length63_lt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length63_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $63, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    shrl $31, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 63) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length63_gt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length63_gt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $63, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setg %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 63) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length63_eq_const(ptr %X) nounwind {
-; X64-SSE2-LABEL: length63_eq_const:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-SSE2-NEXT:    movdqu 32(%rdi), %xmm2
-; X64-SSE2-NEXT:    movdqu 47(%rdi), %xmm3
-; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; X64-SSE2-NEXT:    pand %xmm3, %xmm2
-; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE2-NEXT:    pand %xmm1, %xmm0
-; X64-SSE2-NEXT:    pand %xmm2, %xmm0
-; X64-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT:    sete %al
-; X64-SSE2-NEXT:    retq
-;
-; X64-SSE41-LABEL: length63_eq_const:
-; X64-SSE41:       # %bb.0:
-; X64-SSE41-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE41-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-SSE41-NEXT:    movdqu 32(%rdi), %xmm2
-; X64-SSE41-NEXT:    movdqu 47(%rdi), %xmm3
-; X64-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; X64-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; X64-SSE41-NEXT:    por %xmm3, %xmm2
-; X64-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; X64-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE41-NEXT:    por %xmm1, %xmm0
-; X64-SSE41-NEXT:    por %xmm2, %xmm0
-; X64-SSE41-NEXT:    ptest %xmm0, %xmm0
-; X64-SSE41-NEXT:    sete %al
-; X64-SSE41-NEXT:    retq
-;
-; X64-AVX1-LABEL: length63_eq_const:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    vmovups (%rdi), %ymm0
-; X64-AVX1-NEXT:    vmovups 31(%rdi), %ymm1
-; X64-AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; X64-AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
-; X64-AVX1-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX1-NEXT:    sete %al
-; X64-AVX1-NEXT:    vzeroupper
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length63_eq_const:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT:    vmovdqu 31(%rdi), %ymm1
-; X64-AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; X64-AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX2-NEXT:    sete %al
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    retq
-;
-; X64-AVX512-LABEL: length63_eq_const:
-; X64-AVX512:       # %bb.0:
-; X64-AVX512-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX512-NEXT:    vmovdqu 31(%rdi), %ymm1
-; X64-AVX512-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; X64-AVX512-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX512-NEXT:    sete %al
-; X64-AVX512-NEXT:    vzeroupper
-; X64-AVX512-NEXT:    retq
-;
-; X64-MIC-AVX-LABEL: length63_eq_const:
-; X64-MIC-AVX:       # %bb.0:
-; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-MIC-AVX-NEXT:    vmovdqu 31(%rdi), %ymm1
-; X64-MIC-AVX-NEXT:    vmovdqa {{.*#+}} ymm2 = [875770417,943142453,842084409,909456435,809056311,875770417,943142453,842084409]
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm2, %zmm1, %k0
-; X64-MIC-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [858927408,926299444,825243960,892613426,959985462,858927408,926299444,825243960]
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm1, %zmm0, %k1
-; X64-MIC-AVX-NEXT:    kortestw %k0, %k1
-; X64-MIC-AVX-NEXT:    sete %al
-; X64-MIC-AVX-NEXT:    vzeroupper
-; X64-MIC-AVX-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 63) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length64(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length64:
-; X64:       # %bb.0:
-; X64-NEXT:    movl $64, %edx
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 64) nounwind
-  ret i32 %m
-}
-
-define i1 @length64_eq(ptr %x, ptr %y) nounwind {
-; X64-SSE2-LABEL: length64_eq:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-SSE2-NEXT:    movdqu 32(%rdi), %xmm2
-; X64-SSE2-NEXT:    movdqu 48(%rdi), %xmm3
-; X64-SSE2-NEXT:    movdqu (%rsi), %xmm4
-; X64-SSE2-NEXT:    pcmpeqb %xmm0, %xmm4
-; X64-SSE2-NEXT:    movdqu 16(%rsi), %xmm0
-; X64-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
-; X64-SSE2-NEXT:    pand %xmm4, %xmm0
-; X64-SSE2-NEXT:    movdqu 32(%rsi), %xmm1
-; X64-SSE2-NEXT:    pcmpeqb %xmm2, %xmm1
-; X64-SSE2-NEXT:    movdqu 48(%rsi), %xmm2
-; X64-SSE2-NEXT:    pcmpeqb %xmm3, %xmm2
-; X64-SSE2-NEXT:    pand %xmm1, %xmm2
-; X64-SSE2-NEXT:    pand %xmm0, %xmm2
-; X64-SSE2-NEXT:    pmovmskb %xmm2, %eax
-; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT:    setne %al
-; X64-SSE2-NEXT:    retq
-;
-; X64-SSE41-LABEL: length64_eq:
-; X64-SSE41:       # %bb.0:
-; X64-SSE41-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE41-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-SSE41-NEXT:    movdqu 32(%rdi), %xmm2
-; X64-SSE41-NEXT:    movdqu 48(%rdi), %xmm3
-; X64-SSE41-NEXT:    movdqu (%rsi), %xmm4
-; X64-SSE41-NEXT:    pxor %xmm0, %xmm4
-; X64-SSE41-NEXT:    movdqu 16(%rsi), %xmm0
-; X64-SSE41-NEXT:    pxor %xmm1, %xmm0
-; X64-SSE41-NEXT:    por %xmm4, %xmm0
-; X64-SSE41-NEXT:    movdqu 32(%rsi), %xmm1
-; X64-SSE41-NEXT:    pxor %xmm2, %xmm1
-; X64-SSE41-NEXT:    movdqu 48(%rsi), %xmm2
-; X64-SSE41-NEXT:    pxor %xmm3, %xmm2
-; X64-SSE41-NEXT:    por %xmm1, %xmm2
-; X64-SSE41-NEXT:    por %xmm0, %xmm2
-; X64-SSE41-NEXT:    ptest %xmm2, %xmm2
-; X64-SSE41-NEXT:    setne %al
-; X64-SSE41-NEXT:    retq
-;
-; X64-AVX1-LABEL: length64_eq:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    vmovups (%rdi), %ymm0
-; X64-AVX1-NEXT:    vmovups 32(%rdi), %ymm1
-; X64-AVX1-NEXT:    vxorps 32(%rsi), %ymm1, %ymm1
-; X64-AVX1-NEXT:    vxorps (%rsi), %ymm0, %ymm0
-; X64-AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
-; X64-AVX1-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX1-NEXT:    setne %al
-; X64-AVX1-NEXT:    vzeroupper
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length64_eq:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT:    vmovdqu 32(%rdi), %ymm1
-; X64-AVX2-NEXT:    vpxor 32(%rsi), %ymm1, %ymm1
-; X64-AVX2-NEXT:    vpxor (%rsi), %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX2-NEXT:    setne %al
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    retq
-;
-; X64-AVX512-LABEL: length64_eq:
-; X64-AVX512:       # %bb.0:
-; X64-AVX512-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-AVX512-NEXT:    vpcmpneqd (%rsi), %zmm0, %k0
-; X64-AVX512-NEXT:    kortestw %k0, %k0
-; X64-AVX512-NEXT:    setne %al
-; X64-AVX512-NEXT:    vzeroupper
-; X64-AVX512-NEXT:    retq
-;
-; X64-MIC-AVX2-LABEL: length64_eq:
-; X64-MIC-AVX2:       # %bb.0:
-; X64-MIC-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-MIC-AVX2-NEXT:    vmovdqu 32(%rdi), %ymm1
-; X64-MIC-AVX2-NEXT:    vmovdqu (%rsi), %ymm2
-; X64-MIC-AVX2-NEXT:    vmovdqu 32(%rsi), %ymm3
-; X64-MIC-AVX2-NEXT:    vpcmpneqd %zmm3, %zmm1, %k0
-; X64-MIC-AVX2-NEXT:    vpcmpneqd %zmm2, %zmm0, %k1
-; X64-MIC-AVX2-NEXT:    kortestw %k0, %k1
-; X64-MIC-AVX2-NEXT:    setne %al
-; X64-MIC-AVX2-NEXT:    vzeroupper
-; X64-MIC-AVX2-NEXT:    retq
-;
-; X64-MIC-AVX512F-LABEL: length64_eq:
-; X64-MIC-AVX512F:       # %bb.0:
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd (%rsi), %zmm0, %k0
-; X64-MIC-AVX512F-NEXT:    kortestw %k0, %k0
-; X64-MIC-AVX512F-NEXT:    setne %al
-; X64-MIC-AVX512F-NEXT:    vzeroupper
-; X64-MIC-AVX512F-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 64) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length64_lt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length64_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $64, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    shrl $31, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 64) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length64_gt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length64_gt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $64, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setg %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 64) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length64_eq_const(ptr %X) nounwind {
-; X64-SSE2-LABEL: length64_eq_const:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-SSE2-NEXT:    movdqu 32(%rdi), %xmm2
-; X64-SSE2-NEXT:    movdqu 48(%rdi), %xmm3
-; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; X64-SSE2-NEXT:    pand %xmm3, %xmm2
-; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE2-NEXT:    pand %xmm1, %xmm0
-; X64-SSE2-NEXT:    pand %xmm2, %xmm0
-; X64-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT:    sete %al
-; X64-SSE2-NEXT:    retq
-;
-; X64-SSE41-LABEL: length64_eq_const:
-; X64-SSE41:       # %bb.0:
-; X64-SSE41-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE41-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-SSE41-NEXT:    movdqu 32(%rdi), %xmm2
-; X64-SSE41-NEXT:    movdqu 48(%rdi), %xmm3
-; X64-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; X64-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; X64-SSE41-NEXT:    por %xmm3, %xmm2
-; X64-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; X64-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE41-NEXT:    por %xmm1, %xmm0
-; X64-SSE41-NEXT:    por %xmm2, %xmm0
-; X64-SSE41-NEXT:    ptest %xmm0, %xmm0
-; X64-SSE41-NEXT:    sete %al
-; X64-SSE41-NEXT:    retq
-;
-; X64-AVX1-LABEL: length64_eq_const:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    vmovups (%rdi), %ymm0
-; X64-AVX1-NEXT:    vmovups 32(%rdi), %ymm1
-; X64-AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; X64-AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
-; X64-AVX1-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX1-NEXT:    sete %al
-; X64-AVX1-NEXT:    vzeroupper
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length64_eq_const:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT:    vmovdqu 32(%rdi), %ymm1
-; X64-AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; X64-AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX2-NEXT:    sete %al
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    retq
-;
-; X64-AVX512-LABEL: length64_eq_const:
-; X64-AVX512:       # %bb.0:
-; X64-AVX512-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-AVX512-NEXT:    vpcmpneqd .L.str(%rip), %zmm0, %k0
-; X64-AVX512-NEXT:    kortestw %k0, %k0
-; X64-AVX512-NEXT:    sete %al
-; X64-AVX512-NEXT:    vzeroupper
-; X64-AVX512-NEXT:    retq
-;
-; X64-MIC-AVX2-LABEL: length64_eq_const:
-; X64-MIC-AVX2:       # %bb.0:
-; X64-MIC-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-MIC-AVX2-NEXT:    vmovdqu 32(%rdi), %ymm1
-; X64-MIC-AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [892613426,959985462,858927408,926299444,825243960,892613426,959985462,858927408]
-; X64-MIC-AVX2-NEXT:    vpcmpneqd %zmm2, %zmm1, %k0
-; X64-MIC-AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [858927408,926299444,825243960,892613426,959985462,858927408,926299444,825243960]
-; X64-MIC-AVX2-NEXT:    vpcmpneqd %zmm1, %zmm0, %k1
-; X64-MIC-AVX2-NEXT:    kortestw %k0, %k1
-; X64-MIC-AVX2-NEXT:    sete %al
-; X64-MIC-AVX2-NEXT:    vzeroupper
-; X64-MIC-AVX2-NEXT:    retq
-;
-; X64-MIC-AVX512F-LABEL: length64_eq_const:
-; X64-MIC-AVX512F:       # %bb.0:
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd .L.str(%rip), %zmm0, %k0
-; X64-MIC-AVX512F-NEXT:    kortestw %k0, %k0
-; X64-MIC-AVX512F-NEXT:    sete %al
-; X64-MIC-AVX512F-NEXT:    vzeroupper
-; X64-MIC-AVX512F-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 64) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length96(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length96:
-; X64:       # %bb.0:
-; X64-NEXT:    movl $96, %edx
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 96) nounwind
-  ret i32 %m
-}
-
-define i1 @length96_eq(ptr %x, ptr %y) nounwind {
-; X64-SSE-LABEL: length96_eq:
-; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    pushq %rax
-; X64-SSE-NEXT:    movl $96, %edx
-; X64-SSE-NEXT:    callq memcmp
-; X64-SSE-NEXT:    testl %eax, %eax
-; X64-SSE-NEXT:    setne %al
-; X64-SSE-NEXT:    popq %rcx
-; X64-SSE-NEXT:    retq
-;
-; X64-AVX1-LABEL: length96_eq:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    vmovups (%rdi), %ymm0
-; X64-AVX1-NEXT:    vmovups 32(%rdi), %ymm1
-; X64-AVX1-NEXT:    vmovups 64(%rdi), %ymm2
-; X64-AVX1-NEXT:    vxorps 32(%rsi), %ymm1, %ymm1
-; X64-AVX1-NEXT:    vxorps (%rsi), %ymm0, %ymm0
-; X64-AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
-; X64-AVX1-NEXT:    vxorps 64(%rsi), %ymm2, %ymm1
-; X64-AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
-; X64-AVX1-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX1-NEXT:    setne %al
-; X64-AVX1-NEXT:    vzeroupper
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length96_eq:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT:    vmovdqu 32(%rdi), %ymm1
-; X64-AVX2-NEXT:    vmovdqu 64(%rdi), %ymm2
-; X64-AVX2-NEXT:    vpxor 32(%rsi), %ymm1, %ymm1
-; X64-AVX2-NEXT:    vpxor (%rsi), %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpxor 64(%rsi), %ymm2, %ymm1
-; X64-AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX2-NEXT:    setne %al
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    retq
-;
-; X64-AVX512BW-LABEL: length96_eq:
-; X64-AVX512BW:       # %bb.0:
-; X64-AVX512BW-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-AVX512BW-NEXT:    vmovdqu 64(%rdi), %ymm1
-; X64-AVX512BW-NEXT:    vmovdqu 64(%rsi), %ymm2
-; X64-AVX512BW-NEXT:    vpcmpneqb (%rsi), %zmm0, %k0
-; X64-AVX512BW-NEXT:    vpcmpneqb %zmm2, %zmm1, %k1
-; X64-AVX512BW-NEXT:    kortestq %k1, %k0
-; X64-AVX512BW-NEXT:    setne %al
-; X64-AVX512BW-NEXT:    vzeroupper
-; X64-AVX512BW-NEXT:    retq
-;
-; X64-AVX512F-LABEL: length96_eq:
-; X64-AVX512F:       # %bb.0:
-; X64-AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-AVX512F-NEXT:    vmovdqu 64(%rdi), %ymm1
-; X64-AVX512F-NEXT:    vmovdqu 64(%rsi), %ymm2
-; X64-AVX512F-NEXT:    vpcmpneqd (%rsi), %zmm0, %k0
-; X64-AVX512F-NEXT:    vpcmpneqd %zmm2, %zmm1, %k1
-; X64-AVX512F-NEXT:    kortestw %k1, %k0
-; X64-AVX512F-NEXT:    setne %al
-; X64-AVX512F-NEXT:    vzeroupper
-; X64-AVX512F-NEXT:    retq
-;
-; X64-MIC-AVX2-LABEL: length96_eq:
-; X64-MIC-AVX2:       # %bb.0:
-; X64-MIC-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-MIC-AVX2-NEXT:    vmovdqu 32(%rdi), %ymm1
-; X64-MIC-AVX2-NEXT:    vmovdqu 64(%rdi), %ymm2
-; X64-MIC-AVX2-NEXT:    vmovdqu (%rsi), %ymm3
-; X64-MIC-AVX2-NEXT:    vmovdqu 32(%rsi), %ymm4
-; X64-MIC-AVX2-NEXT:    vmovdqu 64(%rsi), %ymm5
-; X64-MIC-AVX2-NEXT:    vpcmpneqd %zmm4, %zmm1, %k0
-; X64-MIC-AVX2-NEXT:    vpcmpneqd %zmm3, %zmm0, %k1
-; X64-MIC-AVX2-NEXT:    korw %k0, %k1, %k0
-; X64-MIC-AVX2-NEXT:    vpcmpneqd %zmm5, %zmm2, %k1
-; X64-MIC-AVX2-NEXT:    kortestw %k1, %k0
-; X64-MIC-AVX2-NEXT:    setne %al
-; X64-MIC-AVX2-NEXT:    vzeroupper
-; X64-MIC-AVX2-NEXT:    retq
-;
-; X64-MIC-AVX512F-LABEL: length96_eq:
-; X64-MIC-AVX512F:       # %bb.0:
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-MIC-AVX512F-NEXT:    vmovdqu 64(%rdi), %ymm1
-; X64-MIC-AVX512F-NEXT:    vmovdqu 64(%rsi), %ymm2
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd (%rsi), %zmm0, %k0
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd %zmm2, %zmm1, %k1
-; X64-MIC-AVX512F-NEXT:    kortestw %k1, %k0
-; X64-MIC-AVX512F-NEXT:    setne %al
-; X64-MIC-AVX512F-NEXT:    vzeroupper
-; X64-MIC-AVX512F-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 96) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length96_lt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length96_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $96, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    shrl $31, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 96) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length96_gt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length96_gt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $96, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setg %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 96) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length96_eq_const(ptr %X) nounwind {
-; X64-SSE-LABEL: length96_eq_const:
-; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    pushq %rax
-; X64-SSE-NEXT:    movl $.L.str, %esi
-; X64-SSE-NEXT:    movl $96, %edx
-; X64-SSE-NEXT:    callq memcmp
-; X64-SSE-NEXT:    testl %eax, %eax
-; X64-SSE-NEXT:    sete %al
-; X64-SSE-NEXT:    popq %rcx
-; X64-SSE-NEXT:    retq
-;
-; X64-AVX1-LABEL: length96_eq_const:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    vmovups (%rdi), %ymm0
-; X64-AVX1-NEXT:    vmovups 32(%rdi), %ymm1
-; X64-AVX1-NEXT:    vmovups 64(%rdi), %ymm2
-; X64-AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; X64-AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
-; X64-AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm1
-; X64-AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
-; X64-AVX1-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX1-NEXT:    sete %al
-; X64-AVX1-NEXT:    vzeroupper
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length96_eq_const:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT:    vmovdqu 32(%rdi), %ymm1
-; X64-AVX2-NEXT:    vmovdqu 64(%rdi), %ymm2
-; X64-AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; X64-AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm1
-; X64-AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX2-NEXT:    sete %al
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    retq
-;
-; X64-AVX512BW-LABEL: length96_eq_const:
-; X64-AVX512BW:       # %bb.0:
-; X64-AVX512BW-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-AVX512BW-NEXT:    vmovdqu 64(%rdi), %ymm1
-; X64-AVX512BW-NEXT:    vpcmpneqb .L.str(%rip), %zmm0, %k0
-; X64-AVX512BW-NEXT:    vpcmpneqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %k1
-; X64-AVX512BW-NEXT:    kortestq %k1, %k0
-; X64-AVX512BW-NEXT:    sete %al
-; X64-AVX512BW-NEXT:    vzeroupper
-; X64-AVX512BW-NEXT:    retq
-;
-; X64-AVX512F-LABEL: length96_eq_const:
-; X64-AVX512F:       # %bb.0:
-; X64-AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-AVX512F-NEXT:    vmovdqu 64(%rdi), %ymm1
-; X64-AVX512F-NEXT:    vpcmpneqd .L.str(%rip), %zmm0, %k0
-; X64-AVX512F-NEXT:    vpcmpneqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %k1
-; X64-AVX512F-NEXT:    kortestw %k1, %k0
-; X64-AVX512F-NEXT:    sete %al
-; X64-AVX512F-NEXT:    vzeroupper
-; X64-AVX512F-NEXT:    retq
-;
-; X64-MIC-AVX2-LABEL: length96_eq_const:
-; X64-MIC-AVX2:       # %bb.0:
-; X64-MIC-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-MIC-AVX2-NEXT:    vmovdqu 32(%rdi), %ymm1
-; X64-MIC-AVX2-NEXT:    vmovdqu 64(%rdi), %ymm2
-; X64-MIC-AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [892613426,959985462,858927408,926299444,825243960,892613426,959985462,858927408]
-; X64-MIC-AVX2-NEXT:    vpcmpneqd %zmm3, %zmm1, %k0
-; X64-MIC-AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [858927408,926299444,825243960,892613426,959985462,858927408,926299444,825243960]
-; X64-MIC-AVX2-NEXT:    vpcmpneqd %zmm1, %zmm0, %k1
-; X64-MIC-AVX2-NEXT:    korw %k0, %k1, %k0
-; X64-MIC-AVX2-NEXT:    vmovdqa {{.*#+}} ymm0 = [926299444,825243960,892613426,959985462,858927408,926299444,825243960,892613426]
-; X64-MIC-AVX2-NEXT:    vpcmpneqd %zmm0, %zmm2, %k1
-; X64-MIC-AVX2-NEXT:    kortestw %k1, %k0
-; X64-MIC-AVX2-NEXT:    sete %al
-; X64-MIC-AVX2-NEXT:    vzeroupper
-; X64-MIC-AVX2-NEXT:    retq
-;
-; X64-MIC-AVX512F-LABEL: length96_eq_const:
-; X64-MIC-AVX512F:       # %bb.0:
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-MIC-AVX512F-NEXT:    vmovdqu 64(%rdi), %ymm1
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd .L.str(%rip), %zmm0, %k0
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %k1
-; X64-MIC-AVX512F-NEXT:    kortestw %k1, %k0
-; X64-MIC-AVX512F-NEXT:    sete %al
-; X64-MIC-AVX512F-NEXT:    vzeroupper
-; X64-MIC-AVX512F-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 96) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length127(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length127:
-; X64:       # %bb.0:
-; X64-NEXT:    movl $127, %edx
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 127) nounwind
-  ret i32 %m
-}
-
-define i1 @length127_eq(ptr %x, ptr %y) nounwind {
-; X64-SSE-LABEL: length127_eq:
-; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    pushq %rax
-; X64-SSE-NEXT:    movl $127, %edx
-; X64-SSE-NEXT:    callq memcmp
-; X64-SSE-NEXT:    testl %eax, %eax
-; X64-SSE-NEXT:    setne %al
-; X64-SSE-NEXT:    popq %rcx
-; X64-SSE-NEXT:    retq
-;
-; X64-AVX1-LABEL: length127_eq:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    vmovups (%rdi), %ymm0
-; X64-AVX1-NEXT:    vmovups 32(%rdi), %ymm1
-; X64-AVX1-NEXT:    vmovups 64(%rdi), %ymm2
-; X64-AVX1-NEXT:    vmovups 95(%rdi), %ymm3
-; X64-AVX1-NEXT:    vxorps 95(%rsi), %ymm3, %ymm3
-; X64-AVX1-NEXT:    vxorps 64(%rsi), %ymm2, %ymm2
-; X64-AVX1-NEXT:    vorps %ymm3, %ymm2, %ymm2
-; X64-AVX1-NEXT:    vxorps 32(%rsi), %ymm1, %ymm1
-; X64-AVX1-NEXT:    vxorps (%rsi), %ymm0, %ymm0
-; X64-AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
-; X64-AVX1-NEXT:    vorps %ymm2, %ymm0, %ymm0
-; X64-AVX1-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX1-NEXT:    setne %al
-; X64-AVX1-NEXT:    vzeroupper
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length127_eq:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT:    vmovdqu 32(%rdi), %ymm1
-; X64-AVX2-NEXT:    vmovdqu 64(%rdi), %ymm2
-; X64-AVX2-NEXT:    vmovdqu 95(%rdi), %ymm3
-; X64-AVX2-NEXT:    vpxor 95(%rsi), %ymm3, %ymm3
-; X64-AVX2-NEXT:    vpxor 64(%rsi), %ymm2, %ymm2
-; X64-AVX2-NEXT:    vpor %ymm3, %ymm2, %ymm2
-; X64-AVX2-NEXT:    vpxor 32(%rsi), %ymm1, %ymm1
-; X64-AVX2-NEXT:    vpxor (%rsi), %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpor %ymm2, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX2-NEXT:    setne %al
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    retq
-;
-; X64-AVX512BW-LABEL: length127_eq:
-; X64-AVX512BW:       # %bb.0:
-; X64-AVX512BW-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-AVX512BW-NEXT:    vmovdqu64 63(%rdi), %zmm1
-; X64-AVX512BW-NEXT:    vpcmpneqb 63(%rsi), %zmm1, %k0
-; X64-AVX512BW-NEXT:    vpcmpneqb (%rsi), %zmm0, %k1
-; X64-AVX512BW-NEXT:    kortestq %k0, %k1
-; X64-AVX512BW-NEXT:    setne %al
-; X64-AVX512BW-NEXT:    vzeroupper
-; X64-AVX512BW-NEXT:    retq
-;
-; X64-AVX512F-LABEL: length127_eq:
-; X64-AVX512F:       # %bb.0:
-; X64-AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-AVX512F-NEXT:    vmovdqu64 63(%rdi), %zmm1
-; X64-AVX512F-NEXT:    vpcmpneqd 63(%rsi), %zmm1, %k0
-; X64-AVX512F-NEXT:    vpcmpneqd (%rsi), %zmm0, %k1
-; X64-AVX512F-NEXT:    kortestw %k0, %k1
-; X64-AVX512F-NEXT:    setne %al
-; X64-AVX512F-NEXT:    vzeroupper
-; X64-AVX512F-NEXT:    retq
-;
-; X64-MIC-AVX2-LABEL: length127_eq:
-; X64-MIC-AVX2:       # %bb.0:
-; X64-MIC-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-MIC-AVX2-NEXT:    vmovdqu 32(%rdi), %ymm1
-; X64-MIC-AVX2-NEXT:    vmovdqu 64(%rdi), %ymm2
-; X64-MIC-AVX2-NEXT:    vmovdqu 95(%rdi), %ymm3
-; X64-MIC-AVX2-NEXT:    vmovdqu (%rsi), %ymm4
-; X64-MIC-AVX2-NEXT:    vmovdqu 32(%rsi), %ymm5
-; X64-MIC-AVX2-NEXT:    vmovdqu 64(%rsi), %ymm6
-; X64-MIC-AVX2-NEXT:    vmovdqu 95(%rsi), %ymm7
-; X64-MIC-AVX2-NEXT:    vpcmpneqd %zmm7, %zmm3, %k0
-; X64-MIC-AVX2-NEXT:    vpcmpneqd %zmm6, %zmm2, %k1
-; X64-MIC-AVX2-NEXT:    korw %k0, %k1, %k0
-; X64-MIC-AVX2-NEXT:    vpcmpneqd %zmm5, %zmm1, %k1
-; X64-MIC-AVX2-NEXT:    vpcmpneqd %zmm4, %zmm0, %k2
-; X64-MIC-AVX2-NEXT:    korw %k1, %k2, %k1
-; X64-MIC-AVX2-NEXT:    kortestw %k0, %k1
-; X64-MIC-AVX2-NEXT:    setne %al
-; X64-MIC-AVX2-NEXT:    vzeroupper
-; X64-MIC-AVX2-NEXT:    retq
-;
-; X64-MIC-AVX512F-LABEL: length127_eq:
-; X64-MIC-AVX512F:       # %bb.0:
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 63(%rdi), %zmm1
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd 63(%rsi), %zmm1, %k0
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd (%rsi), %zmm0, %k1
-; X64-MIC-AVX512F-NEXT:    kortestw %k0, %k1
-; X64-MIC-AVX512F-NEXT:    setne %al
-; X64-MIC-AVX512F-NEXT:    vzeroupper
-; X64-MIC-AVX512F-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 127) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length127_lt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length127_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $127, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    shrl $31, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 127) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length127_gt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length127_gt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $127, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setg %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 127) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length127_eq_const(ptr %X) nounwind {
-; X64-SSE-LABEL: length127_eq_const:
-; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    pushq %rax
-; X64-SSE-NEXT:    movl $.L.str, %esi
-; X64-SSE-NEXT:    movl $127, %edx
-; X64-SSE-NEXT:    callq memcmp
-; X64-SSE-NEXT:    testl %eax, %eax
-; X64-SSE-NEXT:    sete %al
-; X64-SSE-NEXT:    popq %rcx
-; X64-SSE-NEXT:    retq
-;
-; X64-AVX1-LABEL: length127_eq_const:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    vmovups (%rdi), %ymm0
-; X64-AVX1-NEXT:    vmovups 32(%rdi), %ymm1
-; X64-AVX1-NEXT:    vmovups 64(%rdi), %ymm2
-; X64-AVX1-NEXT:    vmovups 95(%rdi), %ymm3
-; X64-AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
-; X64-AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
-; X64-AVX1-NEXT:    vorps %ymm3, %ymm2, %ymm2
-; X64-AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; X64-AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
-; X64-AVX1-NEXT:    vorps %ymm2, %ymm0, %ymm0
-; X64-AVX1-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX1-NEXT:    sete %al
-; X64-AVX1-NEXT:    vzeroupper
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length127_eq_const:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT:    vmovdqu 32(%rdi), %ymm1
-; X64-AVX2-NEXT:    vmovdqu 64(%rdi), %ymm2
-; X64-AVX2-NEXT:    vmovdqu 95(%rdi), %ymm3
-; X64-AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
-; X64-AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
-; X64-AVX2-NEXT:    vpor %ymm3, %ymm2, %ymm2
-; X64-AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; X64-AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpor %ymm2, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX2-NEXT:    sete %al
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    retq
-;
-; X64-AVX512BW-LABEL: length127_eq_const:
-; X64-AVX512BW:       # %bb.0:
-; X64-AVX512BW-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-AVX512BW-NEXT:    vmovdqu64 63(%rdi), %zmm1
-; X64-AVX512BW-NEXT:    vpcmpneqb .L.str+63(%rip), %zmm1, %k0
-; X64-AVX512BW-NEXT:    vpcmpneqb .L.str(%rip), %zmm0, %k1
-; X64-AVX512BW-NEXT:    kortestq %k0, %k1
-; X64-AVX512BW-NEXT:    sete %al
-; X64-AVX512BW-NEXT:    vzeroupper
-; X64-AVX512BW-NEXT:    retq
-;
-; X64-AVX512F-LABEL: length127_eq_const:
-; X64-AVX512F:       # %bb.0:
-; X64-AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-AVX512F-NEXT:    vmovdqu64 63(%rdi), %zmm1
-; X64-AVX512F-NEXT:    vpcmpneqd .L.str+63(%rip), %zmm1, %k0
-; X64-AVX512F-NEXT:    vpcmpneqd .L.str(%rip), %zmm0, %k1
-; X64-AVX512F-NEXT:    kortestw %k0, %k1
-; X64-AVX512F-NEXT:    sete %al
-; X64-AVX512F-NEXT:    vzeroupper
-; X64-AVX512F-NEXT:    retq
-;
-; X64-MIC-AVX2-LABEL: length127_eq_const:
-; X64-MIC-AVX2:       # %bb.0:
-; X64-MIC-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-MIC-AVX2-NEXT:    vmovdqu 32(%rdi), %ymm1
-; X64-MIC-AVX2-NEXT:    vmovdqu 64(%rdi), %ymm2
-; X64-MIC-AVX2-NEXT:    vmovdqu 95(%rdi), %ymm3
-; X64-MIC-AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [943142453,842084409,909456435,809056311,875770417,943142453,842084409,909456435]
-; X64-MIC-AVX2-NEXT:    vpcmpneqd %zmm4, %zmm3, %k0
-; X64-MIC-AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [926299444,825243960,892613426,959985462,858927408,926299444,825243960,892613426]
-; X64-MIC-AVX2-NEXT:    vpcmpneqd %zmm3, %zmm2, %k1
-; X64-MIC-AVX2-NEXT:    korw %k0, %k1, %k0
-; X64-MIC-AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [892613426,959985462,858927408,926299444,825243960,892613426,959985462,858927408]
-; X64-MIC-AVX2-NEXT:    vpcmpneqd %zmm2, %zmm1, %k1
-; X64-MIC-AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [858927408,926299444,825243960,892613426,959985462,858927408,926299444,825243960]
-; X64-MIC-AVX2-NEXT:    vpcmpneqd %zmm1, %zmm0, %k2
-; X64-MIC-AVX2-NEXT:    korw %k1, %k2, %k1
-; X64-MIC-AVX2-NEXT:    kortestw %k0, %k1
-; X64-MIC-AVX2-NEXT:    sete %al
-; X64-MIC-AVX2-NEXT:    vzeroupper
-; X64-MIC-AVX2-NEXT:    retq
-;
-; X64-MIC-AVX512F-LABEL: length127_eq_const:
-; X64-MIC-AVX512F:       # %bb.0:
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 63(%rdi), %zmm1
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd .L.str+63(%rip), %zmm1, %k0
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd .L.str(%rip), %zmm0, %k1
-; X64-MIC-AVX512F-NEXT:    kortestw %k0, %k1
-; X64-MIC-AVX512F-NEXT:    sete %al
-; X64-MIC-AVX512F-NEXT:    vzeroupper
-; X64-MIC-AVX512F-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 127) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length128(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length128:
-; X64:       # %bb.0:
-; X64-NEXT:    movl $128, %edx
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 128) nounwind
-  ret i32 %m
-}
-
-define i1 @length128_eq(ptr %x, ptr %y) nounwind {
-; X64-SSE-LABEL: length128_eq:
-; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    pushq %rax
-; X64-SSE-NEXT:    movl $128, %edx
-; X64-SSE-NEXT:    callq memcmp
-; X64-SSE-NEXT:    testl %eax, %eax
-; X64-SSE-NEXT:    setne %al
-; X64-SSE-NEXT:    popq %rcx
-; X64-SSE-NEXT:    retq
-;
-; X64-AVX1-LABEL: length128_eq:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    vmovups (%rdi), %ymm0
-; X64-AVX1-NEXT:    vmovups 32(%rdi), %ymm1
-; X64-AVX1-NEXT:    vmovups 64(%rdi), %ymm2
-; X64-AVX1-NEXT:    vmovups 96(%rdi), %ymm3
-; X64-AVX1-NEXT:    vxorps 96(%rsi), %ymm3, %ymm3
-; X64-AVX1-NEXT:    vxorps 64(%rsi), %ymm2, %ymm2
-; X64-AVX1-NEXT:    vorps %ymm3, %ymm2, %ymm2
-; X64-AVX1-NEXT:    vxorps 32(%rsi), %ymm1, %ymm1
-; X64-AVX1-NEXT:    vxorps (%rsi), %ymm0, %ymm0
-; X64-AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
-; X64-AVX1-NEXT:    vorps %ymm2, %ymm0, %ymm0
-; X64-AVX1-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX1-NEXT:    setne %al
-; X64-AVX1-NEXT:    vzeroupper
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length128_eq:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT:    vmovdqu 32(%rdi), %ymm1
-; X64-AVX2-NEXT:    vmovdqu 64(%rdi), %ymm2
-; X64-AVX2-NEXT:    vmovdqu 96(%rdi), %ymm3
-; X64-AVX2-NEXT:    vpxor 96(%rsi), %ymm3, %ymm3
-; X64-AVX2-NEXT:    vpxor 64(%rsi), %ymm2, %ymm2
-; X64-AVX2-NEXT:    vpor %ymm3, %ymm2, %ymm2
-; X64-AVX2-NEXT:    vpxor 32(%rsi), %ymm1, %ymm1
-; X64-AVX2-NEXT:    vpxor (%rsi), %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpor %ymm2, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX2-NEXT:    setne %al
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    retq
-;
-; X64-AVX512BW-LABEL: length128_eq:
-; X64-AVX512BW:       # %bb.0:
-; X64-AVX512BW-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-AVX512BW-NEXT:    vmovdqu64 64(%rdi), %zmm1
-; X64-AVX512BW-NEXT:    vpcmpneqb 64(%rsi), %zmm1, %k0
-; X64-AVX512BW-NEXT:    vpcmpneqb (%rsi), %zmm0, %k1
-; X64-AVX512BW-NEXT:    kortestq %k0, %k1
-; X64-AVX512BW-NEXT:    setne %al
-; X64-AVX512BW-NEXT:    vzeroupper
-; X64-AVX512BW-NEXT:    retq
-;
-; X64-AVX512F-LABEL: length128_eq:
-; X64-AVX512F:       # %bb.0:
-; X64-AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-AVX512F-NEXT:    vmovdqu64 64(%rdi), %zmm1
-; X64-AVX512F-NEXT:    vpcmpneqd 64(%rsi), %zmm1, %k0
-; X64-AVX512F-NEXT:    vpcmpneqd (%rsi), %zmm0, %k1
-; X64-AVX512F-NEXT:    kortestw %k0, %k1
-; X64-AVX512F-NEXT:    setne %al
-; X64-AVX512F-NEXT:    vzeroupper
-; X64-AVX512F-NEXT:    retq
-;
-; X64-MIC-AVX2-LABEL: length128_eq:
-; X64-MIC-AVX2:       # %bb.0:
-; X64-MIC-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-MIC-AVX2-NEXT:    vmovdqu 32(%rdi), %ymm1
-; X64-MIC-AVX2-NEXT:    vmovdqu 64(%rdi), %ymm2
-; X64-MIC-AVX2-NEXT:    vmovdqu 96(%rdi), %ymm3
-; X64-MIC-AVX2-NEXT:    vmovdqu (%rsi), %ymm4
-; X64-MIC-AVX2-NEXT:    vmovdqu 32(%rsi), %ymm5
-; X64-MIC-AVX2-NEXT:    vmovdqu 64(%rsi), %ymm6
-; X64-MIC-AVX2-NEXT:    vmovdqu 96(%rsi), %ymm7
-; X64-MIC-AVX2-NEXT:    vpcmpneqd %zmm7, %zmm3, %k0
-; X64-MIC-AVX2-NEXT:    vpcmpneqd %zmm6, %zmm2, %k1
-; X64-MIC-AVX2-NEXT:    korw %k0, %k1, %k0
-; X64-MIC-AVX2-NEXT:    vpcmpneqd %zmm5, %zmm1, %k1
-; X64-MIC-AVX2-NEXT:    vpcmpneqd %zmm4, %zmm0, %k2
-; X64-MIC-AVX2-NEXT:    korw %k1, %k2, %k1
-; X64-MIC-AVX2-NEXT:    kortestw %k0, %k1
-; X64-MIC-AVX2-NEXT:    setne %al
-; X64-MIC-AVX2-NEXT:    vzeroupper
-; X64-MIC-AVX2-NEXT:    retq
-;
-; X64-MIC-AVX512F-LABEL: length128_eq:
-; X64-MIC-AVX512F:       # %bb.0:
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 64(%rdi), %zmm1
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd 64(%rsi), %zmm1, %k0
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd (%rsi), %zmm0, %k1
-; X64-MIC-AVX512F-NEXT:    kortestw %k0, %k1
-; X64-MIC-AVX512F-NEXT:    setne %al
-; X64-MIC-AVX512F-NEXT:    vzeroupper
-; X64-MIC-AVX512F-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 128) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length128_lt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length128_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $128, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    shrl $31, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 128) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length128_gt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length128_gt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $128, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setg %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 128) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length128_eq_const(ptr %X) nounwind {
-; X64-SSE-LABEL: length128_eq_const:
-; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    pushq %rax
-; X64-SSE-NEXT:    movl $.L.str, %esi
-; X64-SSE-NEXT:    movl $128, %edx
-; X64-SSE-NEXT:    callq memcmp
-; X64-SSE-NEXT:    testl %eax, %eax
-; X64-SSE-NEXT:    sete %al
-; X64-SSE-NEXT:    popq %rcx
-; X64-SSE-NEXT:    retq
-;
-; X64-AVX1-LABEL: length128_eq_const:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    vmovups (%rdi), %ymm0
-; X64-AVX1-NEXT:    vmovups 32(%rdi), %ymm1
-; X64-AVX1-NEXT:    vmovups 64(%rdi), %ymm2
-; X64-AVX1-NEXT:    vmovups 96(%rdi), %ymm3
-; X64-AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
-; X64-AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
-; X64-AVX1-NEXT:    vorps %ymm3, %ymm2, %ymm2
-; X64-AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; X64-AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
-; X64-AVX1-NEXT:    vorps %ymm2, %ymm0, %ymm0
-; X64-AVX1-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX1-NEXT:    sete %al
-; X64-AVX1-NEXT:    vzeroupper
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length128_eq_const:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT:    vmovdqu 32(%rdi), %ymm1
-; X64-AVX2-NEXT:    vmovdqu 64(%rdi), %ymm2
-; X64-AVX2-NEXT:    vmovdqu 96(%rdi), %ymm3
-; X64-AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
-; X64-AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
-; X64-AVX2-NEXT:    vpor %ymm3, %ymm2, %ymm2
-; X64-AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; X64-AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpor %ymm2, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX2-NEXT:    sete %al
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    retq
-;
-; X64-AVX512BW-LABEL: length128_eq_const:
-; X64-AVX512BW:       # %bb.0:
-; X64-AVX512BW-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-AVX512BW-NEXT:    vmovdqu64 64(%rdi), %zmm1
-; X64-AVX512BW-NEXT:    vpcmpneqb .L.str+64(%rip), %zmm1, %k0
-; X64-AVX512BW-NEXT:    vpcmpneqb .L.str(%rip), %zmm0, %k1
-; X64-AVX512BW-NEXT:    kortestq %k0, %k1
-; X64-AVX512BW-NEXT:    sete %al
-; X64-AVX512BW-NEXT:    vzeroupper
-; X64-AVX512BW-NEXT:    retq
-;
-; X64-AVX512F-LABEL: length128_eq_const:
-; X64-AVX512F:       # %bb.0:
-; X64-AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-AVX512F-NEXT:    vmovdqu64 64(%rdi), %zmm1
-; X64-AVX512F-NEXT:    vpcmpneqd .L.str+64(%rip), %zmm1, %k0
-; X64-AVX512F-NEXT:    vpcmpneqd .L.str(%rip), %zmm0, %k1
-; X64-AVX512F-NEXT:    kortestw %k0, %k1
-; X64-AVX512F-NEXT:    sete %al
-; X64-AVX512F-NEXT:    vzeroupper
-; X64-AVX512F-NEXT:    retq
-;
-; X64-MIC-AVX2-LABEL: length128_eq_const:
-; X64-MIC-AVX2:       # %bb.0:
-; X64-MIC-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-MIC-AVX2-NEXT:    vmovdqu 32(%rdi), %ymm1
-; X64-MIC-AVX2-NEXT:    vmovdqu 64(%rdi), %ymm2
-; X64-MIC-AVX2-NEXT:    vmovdqu 96(%rdi), %ymm3
-; X64-MIC-AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [959985462,858927408,926299444,825243960,892613426,959985462,858927408,926299444]
-; X64-MIC-AVX2-NEXT:    vpcmpneqd %zmm4, %zmm3, %k0
-; X64-MIC-AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [926299444,825243960,892613426,959985462,858927408,926299444,825243960,892613426]
-; X64-MIC-AVX2-NEXT:    vpcmpneqd %zmm3, %zmm2, %k1
-; X64-MIC-AVX2-NEXT:    korw %k0, %k1, %k0
-; X64-MIC-AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [892613426,959985462,858927408,926299444,825243960,892613426,959985462,858927408]
-; X64-MIC-AVX2-NEXT:    vpcmpneqd %zmm2, %zmm1, %k1
-; X64-MIC-AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [858927408,926299444,825243960,892613426,959985462,858927408,926299444,825243960]
-; X64-MIC-AVX2-NEXT:    vpcmpneqd %zmm1, %zmm0, %k2
-; X64-MIC-AVX2-NEXT:    korw %k1, %k2, %k1
-; X64-MIC-AVX2-NEXT:    kortestw %k0, %k1
-; X64-MIC-AVX2-NEXT:    sete %al
-; X64-MIC-AVX2-NEXT:    vzeroupper
-; X64-MIC-AVX2-NEXT:    retq
-;
-; X64-MIC-AVX512F-LABEL: length128_eq_const:
-; X64-MIC-AVX512F:       # %bb.0:
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 64(%rdi), %zmm1
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd .L.str+64(%rip), %zmm1, %k0
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd .L.str(%rip), %zmm0, %k1
-; X64-MIC-AVX512F-NEXT:    kortestw %k0, %k1
-; X64-MIC-AVX512F-NEXT:    sete %al
-; X64-MIC-AVX512F-NEXT:    vzeroupper
-; X64-MIC-AVX512F-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 128) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length192(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length192:
-; X64:       # %bb.0:
-; X64-NEXT:    movl $192, %edx
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 192) nounwind
-  ret i32 %m
-}
-
-define i1 @length192_eq(ptr %x, ptr %y) nounwind {
-; X64-SSE-LABEL: length192_eq:
-; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    pushq %rax
-; X64-SSE-NEXT:    movl $192, %edx
-; X64-SSE-NEXT:    callq memcmp
-; X64-SSE-NEXT:    testl %eax, %eax
-; X64-SSE-NEXT:    setne %al
-; X64-SSE-NEXT:    popq %rcx
-; X64-SSE-NEXT:    retq
-;
-; X64-AVX1-LABEL: length192_eq:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    pushq %rax
-; X64-AVX1-NEXT:    movl $192, %edx
-; X64-AVX1-NEXT:    callq memcmp
-; X64-AVX1-NEXT:    testl %eax, %eax
-; X64-AVX1-NEXT:    setne %al
-; X64-AVX1-NEXT:    popq %rcx
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length192_eq:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    pushq %rax
-; X64-AVX2-NEXT:    movl $192, %edx
-; X64-AVX2-NEXT:    callq memcmp
-; X64-AVX2-NEXT:    testl %eax, %eax
-; X64-AVX2-NEXT:    setne %al
-; X64-AVX2-NEXT:    popq %rcx
-; X64-AVX2-NEXT:    retq
-;
-; X64-AVX512BW-LABEL: length192_eq:
-; X64-AVX512BW:       # %bb.0:
-; X64-AVX512BW-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-AVX512BW-NEXT:    vmovdqu64 64(%rdi), %zmm1
-; X64-AVX512BW-NEXT:    vmovdqu64 128(%rdi), %zmm2
-; X64-AVX512BW-NEXT:    vpcmpneqb 64(%rsi), %zmm1, %k0
-; X64-AVX512BW-NEXT:    vpcmpneqb (%rsi), %zmm0, %k1
-; X64-AVX512BW-NEXT:    korq %k0, %k1, %k0
-; X64-AVX512BW-NEXT:    vpcmpneqb 128(%rsi), %zmm2, %k1
-; X64-AVX512BW-NEXT:    kortestq %k1, %k0
-; X64-AVX512BW-NEXT:    setne %al
-; X64-AVX512BW-NEXT:    vzeroupper
-; X64-AVX512BW-NEXT:    retq
-;
-; X64-AVX512F-LABEL: length192_eq:
-; X64-AVX512F:       # %bb.0:
-; X64-AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-AVX512F-NEXT:    vmovdqu64 64(%rdi), %zmm1
-; X64-AVX512F-NEXT:    vmovdqu64 128(%rdi), %zmm2
-; X64-AVX512F-NEXT:    vpcmpneqd 64(%rsi), %zmm1, %k0
-; X64-AVX512F-NEXT:    vpcmpneqd (%rsi), %zmm0, %k1
-; X64-AVX512F-NEXT:    korw %k0, %k1, %k0
-; X64-AVX512F-NEXT:    vpcmpneqd 128(%rsi), %zmm2, %k1
-; X64-AVX512F-NEXT:    kortestw %k1, %k0
-; X64-AVX512F-NEXT:    setne %al
-; X64-AVX512F-NEXT:    vzeroupper
-; X64-AVX512F-NEXT:    retq
-;
-; X64-MIC-AVX2-LABEL: length192_eq:
-; X64-MIC-AVX2:       # %bb.0:
-; X64-MIC-AVX2-NEXT:    pushq %rax
-; X64-MIC-AVX2-NEXT:    movl $192, %edx
-; X64-MIC-AVX2-NEXT:    callq memcmp
-; X64-MIC-AVX2-NEXT:    testl %eax, %eax
-; X64-MIC-AVX2-NEXT:    setne %al
-; X64-MIC-AVX2-NEXT:    popq %rcx
-; X64-MIC-AVX2-NEXT:    retq
-;
-; X64-MIC-AVX512F-LABEL: length192_eq:
-; X64-MIC-AVX512F:       # %bb.0:
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 64(%rdi), %zmm1
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 128(%rdi), %zmm2
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd 64(%rsi), %zmm1, %k0
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd (%rsi), %zmm0, %k1
-; X64-MIC-AVX512F-NEXT:    korw %k0, %k1, %k0
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd 128(%rsi), %zmm2, %k1
-; X64-MIC-AVX512F-NEXT:    kortestw %k1, %k0
-; X64-MIC-AVX512F-NEXT:    setne %al
-; X64-MIC-AVX512F-NEXT:    vzeroupper
-; X64-MIC-AVX512F-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 192) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length192_lt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length192_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $192, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    shrl $31, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 192) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length192_gt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length192_gt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $192, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setg %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 192) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length192_eq_const(ptr %X) nounwind {
-; X64-SSE-LABEL: length192_eq_const:
-; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    pushq %rax
-; X64-SSE-NEXT:    movl $.L.str, %esi
-; X64-SSE-NEXT:    movl $192, %edx
-; X64-SSE-NEXT:    callq memcmp
-; X64-SSE-NEXT:    testl %eax, %eax
-; X64-SSE-NEXT:    sete %al
-; X64-SSE-NEXT:    popq %rcx
-; X64-SSE-NEXT:    retq
-;
-; X64-AVX1-LABEL: length192_eq_const:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    pushq %rax
-; X64-AVX1-NEXT:    movl $.L.str, %esi
-; X64-AVX1-NEXT:    movl $192, %edx
-; X64-AVX1-NEXT:    callq memcmp
-; X64-AVX1-NEXT:    testl %eax, %eax
-; X64-AVX1-NEXT:    sete %al
-; X64-AVX1-NEXT:    popq %rcx
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length192_eq_const:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    pushq %rax
-; X64-AVX2-NEXT:    movl $.L.str, %esi
-; X64-AVX2-NEXT:    movl $192, %edx
-; X64-AVX2-NEXT:    callq memcmp
-; X64-AVX2-NEXT:    testl %eax, %eax
-; X64-AVX2-NEXT:    sete %al
-; X64-AVX2-NEXT:    popq %rcx
-; X64-AVX2-NEXT:    retq
-;
-; X64-AVX512BW-LABEL: length192_eq_const:
-; X64-AVX512BW:       # %bb.0:
-; X64-AVX512BW-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-AVX512BW-NEXT:    vmovdqu64 64(%rdi), %zmm1
-; X64-AVX512BW-NEXT:    vmovdqu64 128(%rdi), %zmm2
-; X64-AVX512BW-NEXT:    vpcmpneqb .L.str+64(%rip), %zmm1, %k0
-; X64-AVX512BW-NEXT:    vpcmpneqb .L.str(%rip), %zmm0, %k1
-; X64-AVX512BW-NEXT:    korq %k0, %k1, %k0
-; X64-AVX512BW-NEXT:    vpcmpneqb .L.str+128(%rip), %zmm2, %k1
-; X64-AVX512BW-NEXT:    kortestq %k1, %k0
-; X64-AVX512BW-NEXT:    sete %al
-; X64-AVX512BW-NEXT:    vzeroupper
-; X64-AVX512BW-NEXT:    retq
-;
-; X64-AVX512F-LABEL: length192_eq_const:
-; X64-AVX512F:       # %bb.0:
-; X64-AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-AVX512F-NEXT:    vmovdqu64 64(%rdi), %zmm1
-; X64-AVX512F-NEXT:    vmovdqu64 128(%rdi), %zmm2
-; X64-AVX512F-NEXT:    vpcmpneqd .L.str+64(%rip), %zmm1, %k0
-; X64-AVX512F-NEXT:    vpcmpneqd .L.str(%rip), %zmm0, %k1
-; X64-AVX512F-NEXT:    korw %k0, %k1, %k0
-; X64-AVX512F-NEXT:    vpcmpneqd .L.str+128(%rip), %zmm2, %k1
-; X64-AVX512F-NEXT:    kortestw %k1, %k0
-; X64-AVX512F-NEXT:    sete %al
-; X64-AVX512F-NEXT:    vzeroupper
-; X64-AVX512F-NEXT:    retq
-;
-; X64-MIC-AVX2-LABEL: length192_eq_const:
-; X64-MIC-AVX2:       # %bb.0:
-; X64-MIC-AVX2-NEXT:    pushq %rax
-; X64-MIC-AVX2-NEXT:    movl $.L.str, %esi
-; X64-MIC-AVX2-NEXT:    movl $192, %edx
-; X64-MIC-AVX2-NEXT:    callq memcmp
-; X64-MIC-AVX2-NEXT:    testl %eax, %eax
-; X64-MIC-AVX2-NEXT:    sete %al
-; X64-MIC-AVX2-NEXT:    popq %rcx
-; X64-MIC-AVX2-NEXT:    retq
-;
-; X64-MIC-AVX512F-LABEL: length192_eq_const:
-; X64-MIC-AVX512F:       # %bb.0:
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 64(%rdi), %zmm1
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 128(%rdi), %zmm2
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd .L.str+64(%rip), %zmm1, %k0
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd .L.str(%rip), %zmm0, %k1
-; X64-MIC-AVX512F-NEXT:    korw %k0, %k1, %k0
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd .L.str+128(%rip), %zmm2, %k1
-; X64-MIC-AVX512F-NEXT:    kortestw %k1, %k0
-; X64-MIC-AVX512F-NEXT:    sete %al
-; X64-MIC-AVX512F-NEXT:    vzeroupper
-; X64-MIC-AVX512F-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 192) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length255(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length255:
-; X64:       # %bb.0:
-; X64-NEXT:    movl $255, %edx
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 255) nounwind
-  ret i32 %m
-}
-
-define i1 @length255_eq(ptr %x, ptr %y) nounwind {
-; X64-SSE-LABEL: length255_eq:
-; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    pushq %rax
-; X64-SSE-NEXT:    movl $255, %edx
-; X64-SSE-NEXT:    callq memcmp
-; X64-SSE-NEXT:    testl %eax, %eax
-; X64-SSE-NEXT:    setne %al
-; X64-SSE-NEXT:    popq %rcx
-; X64-SSE-NEXT:    retq
-;
-; X64-AVX1-LABEL: length255_eq:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    pushq %rax
-; X64-AVX1-NEXT:    movl $255, %edx
-; X64-AVX1-NEXT:    callq memcmp
-; X64-AVX1-NEXT:    testl %eax, %eax
-; X64-AVX1-NEXT:    setne %al
-; X64-AVX1-NEXT:    popq %rcx
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length255_eq:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    pushq %rax
-; X64-AVX2-NEXT:    movl $255, %edx
-; X64-AVX2-NEXT:    callq memcmp
-; X64-AVX2-NEXT:    testl %eax, %eax
-; X64-AVX2-NEXT:    setne %al
-; X64-AVX2-NEXT:    popq %rcx
-; X64-AVX2-NEXT:    retq
-;
-; X64-AVX512BW-LABEL: length255_eq:
-; X64-AVX512BW:       # %bb.0:
-; X64-AVX512BW-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-AVX512BW-NEXT:    vmovdqu64 64(%rdi), %zmm1
-; X64-AVX512BW-NEXT:    vmovdqu64 128(%rdi), %zmm2
-; X64-AVX512BW-NEXT:    vmovdqu64 191(%rdi), %zmm3
-; X64-AVX512BW-NEXT:    vpcmpneqb 191(%rsi), %zmm3, %k0
-; X64-AVX512BW-NEXT:    vpcmpneqb 128(%rsi), %zmm2, %k1
-; X64-AVX512BW-NEXT:    korq %k0, %k1, %k0
-; X64-AVX512BW-NEXT:    vpcmpneqb 64(%rsi), %zmm1, %k1
-; X64-AVX512BW-NEXT:    vpcmpneqb (%rsi), %zmm0, %k2
-; X64-AVX512BW-NEXT:    korq %k1, %k2, %k1
-; X64-AVX512BW-NEXT:    kortestq %k0, %k1
-; X64-AVX512BW-NEXT:    setne %al
-; X64-AVX512BW-NEXT:    vzeroupper
-; X64-AVX512BW-NEXT:    retq
-;
-; X64-AVX512F-LABEL: length255_eq:
-; X64-AVX512F:       # %bb.0:
-; X64-AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-AVX512F-NEXT:    vmovdqu64 64(%rdi), %zmm1
-; X64-AVX512F-NEXT:    vmovdqu64 128(%rdi), %zmm2
-; X64-AVX512F-NEXT:    vmovdqu64 191(%rdi), %zmm3
-; X64-AVX512F-NEXT:    vpcmpneqd 191(%rsi), %zmm3, %k0
-; X64-AVX512F-NEXT:    vpcmpneqd 128(%rsi), %zmm2, %k1
-; X64-AVX512F-NEXT:    korw %k0, %k1, %k0
-; X64-AVX512F-NEXT:    vpcmpneqd 64(%rsi), %zmm1, %k1
-; X64-AVX512F-NEXT:    vpcmpneqd (%rsi), %zmm0, %k2
-; X64-AVX512F-NEXT:    korw %k1, %k2, %k1
-; X64-AVX512F-NEXT:    kortestw %k0, %k1
-; X64-AVX512F-NEXT:    setne %al
-; X64-AVX512F-NEXT:    vzeroupper
-; X64-AVX512F-NEXT:    retq
-;
-; X64-MIC-AVX2-LABEL: length255_eq:
-; X64-MIC-AVX2:       # %bb.0:
-; X64-MIC-AVX2-NEXT:    pushq %rax
-; X64-MIC-AVX2-NEXT:    movl $255, %edx
-; X64-MIC-AVX2-NEXT:    callq memcmp
-; X64-MIC-AVX2-NEXT:    testl %eax, %eax
-; X64-MIC-AVX2-NEXT:    setne %al
-; X64-MIC-AVX2-NEXT:    popq %rcx
-; X64-MIC-AVX2-NEXT:    retq
-;
-; X64-MIC-AVX512F-LABEL: length255_eq:
-; X64-MIC-AVX512F:       # %bb.0:
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 64(%rdi), %zmm1
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 128(%rdi), %zmm2
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 191(%rdi), %zmm3
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd 191(%rsi), %zmm3, %k0
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd 128(%rsi), %zmm2, %k1
-; X64-MIC-AVX512F-NEXT:    korw %k0, %k1, %k0
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd 64(%rsi), %zmm1, %k1
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd (%rsi), %zmm0, %k2
-; X64-MIC-AVX512F-NEXT:    korw %k1, %k2, %k1
-; X64-MIC-AVX512F-NEXT:    kortestw %k0, %k1
-; X64-MIC-AVX512F-NEXT:    setne %al
-; X64-MIC-AVX512F-NEXT:    vzeroupper
-; X64-MIC-AVX512F-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 255) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length255_lt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length255_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $255, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    shrl $31, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 255) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length255_gt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length255_gt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $255, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setg %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 255) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length255_eq_const(ptr %X) nounwind {
-; X64-SSE-LABEL: length255_eq_const:
-; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    pushq %rax
-; X64-SSE-NEXT:    movl $.L.str, %esi
-; X64-SSE-NEXT:    movl $255, %edx
-; X64-SSE-NEXT:    callq memcmp
-; X64-SSE-NEXT:    testl %eax, %eax
-; X64-SSE-NEXT:    sete %al
-; X64-SSE-NEXT:    popq %rcx
-; X64-SSE-NEXT:    retq
-;
-; X64-AVX1-LABEL: length255_eq_const:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    pushq %rax
-; X64-AVX1-NEXT:    movl $.L.str, %esi
-; X64-AVX1-NEXT:    movl $255, %edx
-; X64-AVX1-NEXT:    callq memcmp
-; X64-AVX1-NEXT:    testl %eax, %eax
-; X64-AVX1-NEXT:    sete %al
-; X64-AVX1-NEXT:    popq %rcx
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length255_eq_const:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    pushq %rax
-; X64-AVX2-NEXT:    movl $.L.str, %esi
-; X64-AVX2-NEXT:    movl $255, %edx
-; X64-AVX2-NEXT:    callq memcmp
-; X64-AVX2-NEXT:    testl %eax, %eax
-; X64-AVX2-NEXT:    sete %al
-; X64-AVX2-NEXT:    popq %rcx
-; X64-AVX2-NEXT:    retq
-;
-; X64-AVX512BW-LABEL: length255_eq_const:
-; X64-AVX512BW:       # %bb.0:
-; X64-AVX512BW-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-AVX512BW-NEXT:    vmovdqu64 64(%rdi), %zmm1
-; X64-AVX512BW-NEXT:    vmovdqu64 128(%rdi), %zmm2
-; X64-AVX512BW-NEXT:    vmovdqu64 191(%rdi), %zmm3
-; X64-AVX512BW-NEXT:    vpcmpneqb .L.str+191(%rip), %zmm3, %k0
-; X64-AVX512BW-NEXT:    vpcmpneqb .L.str+128(%rip), %zmm2, %k1
-; X64-AVX512BW-NEXT:    korq %k0, %k1, %k0
-; X64-AVX512BW-NEXT:    vpcmpneqb .L.str+64(%rip), %zmm1, %k1
-; X64-AVX512BW-NEXT:    vpcmpneqb .L.str(%rip), %zmm0, %k2
-; X64-AVX512BW-NEXT:    korq %k1, %k2, %k1
-; X64-AVX512BW-NEXT:    kortestq %k0, %k1
-; X64-AVX512BW-NEXT:    sete %al
-; X64-AVX512BW-NEXT:    vzeroupper
-; X64-AVX512BW-NEXT:    retq
-;
-; X64-AVX512F-LABEL: length255_eq_const:
-; X64-AVX512F:       # %bb.0:
-; X64-AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-AVX512F-NEXT:    vmovdqu64 64(%rdi), %zmm1
-; X64-AVX512F-NEXT:    vmovdqu64 128(%rdi), %zmm2
-; X64-AVX512F-NEXT:    vmovdqu64 191(%rdi), %zmm3
-; X64-AVX512F-NEXT:    vpcmpneqd .L.str+191(%rip), %zmm3, %k0
-; X64-AVX512F-NEXT:    vpcmpneqd .L.str+128(%rip), %zmm2, %k1
-; X64-AVX512F-NEXT:    korw %k0, %k1, %k0
-; X64-AVX512F-NEXT:    vpcmpneqd .L.str+64(%rip), %zmm1, %k1
-; X64-AVX512F-NEXT:    vpcmpneqd .L.str(%rip), %zmm0, %k2
-; X64-AVX512F-NEXT:    korw %k1, %k2, %k1
-; X64-AVX512F-NEXT:    kortestw %k0, %k1
-; X64-AVX512F-NEXT:    sete %al
-; X64-AVX512F-NEXT:    vzeroupper
-; X64-AVX512F-NEXT:    retq
-;
-; X64-MIC-AVX2-LABEL: length255_eq_const:
-; X64-MIC-AVX2:       # %bb.0:
-; X64-MIC-AVX2-NEXT:    pushq %rax
-; X64-MIC-AVX2-NEXT:    movl $.L.str, %esi
-; X64-MIC-AVX2-NEXT:    movl $255, %edx
-; X64-MIC-AVX2-NEXT:    callq memcmp
-; X64-MIC-AVX2-NEXT:    testl %eax, %eax
-; X64-MIC-AVX2-NEXT:    sete %al
-; X64-MIC-AVX2-NEXT:    popq %rcx
-; X64-MIC-AVX2-NEXT:    retq
-;
-; X64-MIC-AVX512F-LABEL: length255_eq_const:
-; X64-MIC-AVX512F:       # %bb.0:
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 64(%rdi), %zmm1
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 128(%rdi), %zmm2
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 191(%rdi), %zmm3
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd .L.str+191(%rip), %zmm3, %k0
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd .L.str+128(%rip), %zmm2, %k1
-; X64-MIC-AVX512F-NEXT:    korw %k0, %k1, %k0
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd .L.str+64(%rip), %zmm1, %k1
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd .L.str(%rip), %zmm0, %k2
-; X64-MIC-AVX512F-NEXT:    korw %k1, %k2, %k1
-; X64-MIC-AVX512F-NEXT:    kortestw %k0, %k1
-; X64-MIC-AVX512F-NEXT:    sete %al
-; X64-MIC-AVX512F-NEXT:    vzeroupper
-; X64-MIC-AVX512F-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 255) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length256(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length256:
-; X64:       # %bb.0:
-; X64-NEXT:    movl $256, %edx # imm = 0x100
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 256) nounwind
-  ret i32 %m
-}
-
-define i1 @length256_eq(ptr %x, ptr %y) nounwind {
-; X64-SSE-LABEL: length256_eq:
-; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    pushq %rax
-; X64-SSE-NEXT:    movl $256, %edx # imm = 0x100
-; X64-SSE-NEXT:    callq memcmp
-; X64-SSE-NEXT:    testl %eax, %eax
-; X64-SSE-NEXT:    setne %al
-; X64-SSE-NEXT:    popq %rcx
-; X64-SSE-NEXT:    retq
-;
-; X64-AVX1-LABEL: length256_eq:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    pushq %rax
-; X64-AVX1-NEXT:    movl $256, %edx # imm = 0x100
-; X64-AVX1-NEXT:    callq memcmp
-; X64-AVX1-NEXT:    testl %eax, %eax
-; X64-AVX1-NEXT:    setne %al
-; X64-AVX1-NEXT:    popq %rcx
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length256_eq:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    pushq %rax
-; X64-AVX2-NEXT:    movl $256, %edx # imm = 0x100
-; X64-AVX2-NEXT:    callq memcmp
-; X64-AVX2-NEXT:    testl %eax, %eax
-; X64-AVX2-NEXT:    setne %al
-; X64-AVX2-NEXT:    popq %rcx
-; X64-AVX2-NEXT:    retq
-;
-; X64-AVX512BW-LABEL: length256_eq:
-; X64-AVX512BW:       # %bb.0:
-; X64-AVX512BW-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-AVX512BW-NEXT:    vmovdqu64 64(%rdi), %zmm1
-; X64-AVX512BW-NEXT:    vmovdqu64 128(%rdi), %zmm2
-; X64-AVX512BW-NEXT:    vmovdqu64 192(%rdi), %zmm3
-; X64-AVX512BW-NEXT:    vpcmpneqb 192(%rsi), %zmm3, %k0
-; X64-AVX512BW-NEXT:    vpcmpneqb 128(%rsi), %zmm2, %k1
-; X64-AVX512BW-NEXT:    korq %k0, %k1, %k0
-; X64-AVX512BW-NEXT:    vpcmpneqb 64(%rsi), %zmm1, %k1
-; X64-AVX512BW-NEXT:    vpcmpneqb (%rsi), %zmm0, %k2
-; X64-AVX512BW-NEXT:    korq %k1, %k2, %k1
-; X64-AVX512BW-NEXT:    kortestq %k0, %k1
-; X64-AVX512BW-NEXT:    setne %al
-; X64-AVX512BW-NEXT:    vzeroupper
-; X64-AVX512BW-NEXT:    retq
-;
-; X64-AVX512F-LABEL: length256_eq:
-; X64-AVX512F:       # %bb.0:
-; X64-AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-AVX512F-NEXT:    vmovdqu64 64(%rdi), %zmm1
-; X64-AVX512F-NEXT:    vmovdqu64 128(%rdi), %zmm2
-; X64-AVX512F-NEXT:    vmovdqu64 192(%rdi), %zmm3
-; X64-AVX512F-NEXT:    vpcmpneqd 192(%rsi), %zmm3, %k0
-; X64-AVX512F-NEXT:    vpcmpneqd 128(%rsi), %zmm2, %k1
-; X64-AVX512F-NEXT:    korw %k0, %k1, %k0
-; X64-AVX512F-NEXT:    vpcmpneqd 64(%rsi), %zmm1, %k1
-; X64-AVX512F-NEXT:    vpcmpneqd (%rsi), %zmm0, %k2
-; X64-AVX512F-NEXT:    korw %k1, %k2, %k1
-; X64-AVX512F-NEXT:    kortestw %k0, %k1
-; X64-AVX512F-NEXT:    setne %al
-; X64-AVX512F-NEXT:    vzeroupper
-; X64-AVX512F-NEXT:    retq
-;
-; X64-MIC-AVX2-LABEL: length256_eq:
-; X64-MIC-AVX2:       # %bb.0:
-; X64-MIC-AVX2-NEXT:    pushq %rax
-; X64-MIC-AVX2-NEXT:    movl $256, %edx # imm = 0x100
-; X64-MIC-AVX2-NEXT:    callq memcmp
-; X64-MIC-AVX2-NEXT:    testl %eax, %eax
-; X64-MIC-AVX2-NEXT:    setne %al
-; X64-MIC-AVX2-NEXT:    popq %rcx
-; X64-MIC-AVX2-NEXT:    retq
-;
-; X64-MIC-AVX512F-LABEL: length256_eq:
-; X64-MIC-AVX512F:       # %bb.0:
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 64(%rdi), %zmm1
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 128(%rdi), %zmm2
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 192(%rdi), %zmm3
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd 192(%rsi), %zmm3, %k0
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd 128(%rsi), %zmm2, %k1
-; X64-MIC-AVX512F-NEXT:    korw %k0, %k1, %k0
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd 64(%rsi), %zmm1, %k1
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd (%rsi), %zmm0, %k2
-; X64-MIC-AVX512F-NEXT:    korw %k1, %k2, %k1
-; X64-MIC-AVX512F-NEXT:    kortestw %k0, %k1
-; X64-MIC-AVX512F-NEXT:    setne %al
-; X64-MIC-AVX512F-NEXT:    vzeroupper
-; X64-MIC-AVX512F-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 256) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length256_lt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length256_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $256, %edx # imm = 0x100
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    shrl $31, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 256) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length256_gt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length256_gt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $256, %edx # imm = 0x100
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setg %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 256) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length256_eq_const(ptr %X) nounwind {
-; X64-SSE-LABEL: length256_eq_const:
-; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    pushq %rax
-; X64-SSE-NEXT:    movl $.L.str, %esi
-; X64-SSE-NEXT:    movl $256, %edx # imm = 0x100
-; X64-SSE-NEXT:    callq memcmp
-; X64-SSE-NEXT:    testl %eax, %eax
-; X64-SSE-NEXT:    sete %al
-; X64-SSE-NEXT:    popq %rcx
-; X64-SSE-NEXT:    retq
-;
-; X64-AVX1-LABEL: length256_eq_const:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    pushq %rax
-; X64-AVX1-NEXT:    movl $.L.str, %esi
-; X64-AVX1-NEXT:    movl $256, %edx # imm = 0x100
-; X64-AVX1-NEXT:    callq memcmp
-; X64-AVX1-NEXT:    testl %eax, %eax
-; X64-AVX1-NEXT:    sete %al
-; X64-AVX1-NEXT:    popq %rcx
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length256_eq_const:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    pushq %rax
-; X64-AVX2-NEXT:    movl $.L.str, %esi
-; X64-AVX2-NEXT:    movl $256, %edx # imm = 0x100
-; X64-AVX2-NEXT:    callq memcmp
-; X64-AVX2-NEXT:    testl %eax, %eax
-; X64-AVX2-NEXT:    sete %al
-; X64-AVX2-NEXT:    popq %rcx
-; X64-AVX2-NEXT:    retq
-;
-; X64-AVX512BW-LABEL: length256_eq_const:
-; X64-AVX512BW:       # %bb.0:
-; X64-AVX512BW-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-AVX512BW-NEXT:    vmovdqu64 64(%rdi), %zmm1
-; X64-AVX512BW-NEXT:    vmovdqu64 128(%rdi), %zmm2
-; X64-AVX512BW-NEXT:    vmovdqu64 192(%rdi), %zmm3
-; X64-AVX512BW-NEXT:    vpcmpneqb .L.str+192(%rip), %zmm3, %k0
-; X64-AVX512BW-NEXT:    vpcmpneqb .L.str+128(%rip), %zmm2, %k1
-; X64-AVX512BW-NEXT:    korq %k0, %k1, %k0
-; X64-AVX512BW-NEXT:    vpcmpneqb .L.str+64(%rip), %zmm1, %k1
-; X64-AVX512BW-NEXT:    vpcmpneqb .L.str(%rip), %zmm0, %k2
-; X64-AVX512BW-NEXT:    korq %k1, %k2, %k1
-; X64-AVX512BW-NEXT:    kortestq %k0, %k1
-; X64-AVX512BW-NEXT:    sete %al
-; X64-AVX512BW-NEXT:    vzeroupper
-; X64-AVX512BW-NEXT:    retq
-;
-; X64-AVX512F-LABEL: length256_eq_const:
-; X64-AVX512F:       # %bb.0:
-; X64-AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-AVX512F-NEXT:    vmovdqu64 64(%rdi), %zmm1
-; X64-AVX512F-NEXT:    vmovdqu64 128(%rdi), %zmm2
-; X64-AVX512F-NEXT:    vmovdqu64 192(%rdi), %zmm3
-; X64-AVX512F-NEXT:    vpcmpneqd .L.str+192(%rip), %zmm3, %k0
-; X64-AVX512F-NEXT:    vpcmpneqd .L.str+128(%rip), %zmm2, %k1
-; X64-AVX512F-NEXT:    korw %k0, %k1, %k0
-; X64-AVX512F-NEXT:    vpcmpneqd .L.str+64(%rip), %zmm1, %k1
-; X64-AVX512F-NEXT:    vpcmpneqd .L.str(%rip), %zmm0, %k2
-; X64-AVX512F-NEXT:    korw %k1, %k2, %k1
-; X64-AVX512F-NEXT:    kortestw %k0, %k1
-; X64-AVX512F-NEXT:    sete %al
-; X64-AVX512F-NEXT:    vzeroupper
-; X64-AVX512F-NEXT:    retq
-;
-; X64-MIC-AVX2-LABEL: length256_eq_const:
-; X64-MIC-AVX2:       # %bb.0:
-; X64-MIC-AVX2-NEXT:    pushq %rax
-; X64-MIC-AVX2-NEXT:    movl $.L.str, %esi
-; X64-MIC-AVX2-NEXT:    movl $256, %edx # imm = 0x100
-; X64-MIC-AVX2-NEXT:    callq memcmp
-; X64-MIC-AVX2-NEXT:    testl %eax, %eax
-; X64-MIC-AVX2-NEXT:    sete %al
-; X64-MIC-AVX2-NEXT:    popq %rcx
-; X64-MIC-AVX2-NEXT:    retq
-;
-; X64-MIC-AVX512F-LABEL: length256_eq_const:
-; X64-MIC-AVX512F:       # %bb.0:
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 64(%rdi), %zmm1
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 128(%rdi), %zmm2
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 192(%rdi), %zmm3
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd .L.str+192(%rip), %zmm3, %k0
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd .L.str+128(%rip), %zmm2, %k1
-; X64-MIC-AVX512F-NEXT:    korw %k0, %k1, %k0
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd .L.str+64(%rip), %zmm1, %k1
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd .L.str(%rip), %zmm0, %k2
-; X64-MIC-AVX512F-NEXT:    korw %k1, %k2, %k1
-; X64-MIC-AVX512F-NEXT:    kortestw %k0, %k1
-; X64-MIC-AVX512F-NEXT:    sete %al
-; X64-MIC-AVX512F-NEXT:    vzeroupper
-; X64-MIC-AVX512F-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 256) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length384(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length384:
-; X64:       # %bb.0:
-; X64-NEXT:    movl $384, %edx # imm = 0x180
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 384) nounwind
-  ret i32 %m
-}
-
-define i1 @length384_eq(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length384_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $384, %edx # imm = 0x180
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setne %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 384) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length384_lt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length384_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $384, %edx # imm = 0x180
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    shrl $31, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 384) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length384_gt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length384_gt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $384, %edx # imm = 0x180
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setg %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 384) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length384_eq_const(ptr %X) nounwind {
-; X64-LABEL: length384_eq_const:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $.L.str, %esi
-; X64-NEXT:    movl $384, %edx # imm = 0x180
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    sete %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 384) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length511(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length511:
-; X64:       # %bb.0:
-; X64-NEXT:    movl $511, %edx # imm = 0x1FF
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 511) nounwind
-  ret i32 %m
-}
-
-define i1 @length511_eq(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length511_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $511, %edx # imm = 0x1FF
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setne %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 511) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length511_lt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length511_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $511, %edx # imm = 0x1FF
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    shrl $31, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 511) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length511_gt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length511_gt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $511, %edx # imm = 0x1FF
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setg %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 511) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length511_eq_const(ptr %X) nounwind {
-; X64-LABEL: length511_eq_const:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $.L.str, %esi
-; X64-NEXT:    movl $511, %edx # imm = 0x1FF
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    sete %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 511) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length512(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length512:
-; X64:       # %bb.0:
-; X64-NEXT:    movl $512, %edx # imm = 0x200
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 512) nounwind
-  ret i32 %m
-}
-
-define i1 @length512_eq(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length512_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $512, %edx # imm = 0x200
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setne %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 512) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length512_lt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length512_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $512, %edx # imm = 0x200
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    shrl $31, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 512) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length512_gt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length512_gt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $512, %edx # imm = 0x200
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setg %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 512) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length512_eq_const(ptr %X) nounwind {
-; X64-LABEL: length512_eq_const:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $.L.str, %esi
-; X64-NEXT:    movl $512, %edx # imm = 0x200
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    sete %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 512) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-; This checks that we do not do stupid things with huge sizes.
-define i32 @huge_length(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: huge_length:
-; X64:       # %bb.0:
-; X64-NEXT:    movabsq $9223372036854775807, %rdx # imm = 0x7FFFFFFFFFFFFFFF
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 9223372036854775807) nounwind
-  ret i32 %m
-}
-
-define i1 @huge_length_eq(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: huge_length_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movabsq $9223372036854775807, %rdx # imm = 0x7FFFFFFFFFFFFFFF
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    sete %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 9223372036854775807) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-; This checks non-constant sizes.
-define i32 @nonconst_length(ptr %X, ptr %Y, i64 %size) nounwind {
-; X64-LABEL: nonconst_length:
-; X64:       # %bb.0:
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 %size) nounwind
-  ret i32 %m
-}
-
-define i1 @nonconst_length_eq(ptr %X, ptr %Y, i64 %size) nounwind {
-; X64-LABEL: nonconst_length_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    sete %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 %size) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
diff --git a/llvm/test/CodeGen/X86/memcmp-optsize-x32.ll b/llvm/test/CodeGen/X86/memcmp-optsize-x32.ll
deleted file mode 100644
index 762691151f4bd3..00000000000000
--- a/llvm/test/CodeGen/X86/memcmp-optsize-x32.ll
+++ /dev/null
@@ -1,583 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=cmov | FileCheck %s --check-prefix=X86 --check-prefix=X86-NOSSE
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE2
-
-; This tests codegen time inlining/optimization of memcmp
-; rdar://6480398
-
- at .str = private constant [65 x i8] c"0123456789012345678901234567890123456789012345678901234567890123\00", align 1
-
-declare dso_local i32 @memcmp(ptr, ptr, i32)
-declare dso_local i32 @bcmp(ptr, ptr, i32)
-
-define i32 @length2(ptr %X, ptr %Y) nounwind optsize {
-; X86-LABEL: length2:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzwl (%ecx), %ecx
-; X86-NEXT:    movzwl (%eax), %edx
-; X86-NEXT:    rolw $8, %cx
-; X86-NEXT:    rolw $8, %dx
-; X86-NEXT:    movzwl %cx, %eax
-; X86-NEXT:    movzwl %dx, %ecx
-; X86-NEXT:    subl %ecx, %eax
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind
-  ret i32 %m
-}
-
-define i1 @length2_eq(ptr %X, ptr %Y) nounwind optsize {
-; X86-LABEL: length2_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzwl (%ecx), %ecx
-; X86-NEXT:    cmpw (%eax), %cx
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length2_eq_const(ptr %X) nounwind optsize {
-; X86-LABEL: length2_eq_const:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movzwl (%eax), %eax
-; X86-NEXT:    cmpl $12849, %eax # imm = 0x3231
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([65 x i8], ptr @.str, i32 0, i32 1), i32 2) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length2_eq_nobuiltin_attr(ptr %X, ptr %Y) nounwind optsize {
-; X86-LABEL: length2_eq_nobuiltin_attr:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $2
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind nobuiltin
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length3(ptr %X, ptr %Y) nounwind optsize {
-; X86-LABEL: length3:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movzwl (%eax), %edx
-; X86-NEXT:    movzwl (%ecx), %esi
-; X86-NEXT:    rolw $8, %dx
-; X86-NEXT:    rolw $8, %si
-; X86-NEXT:    cmpw %si, %dx
-; X86-NEXT:    jne .LBB4_3
-; X86-NEXT:  # %bb.1: # %loadbb1
-; X86-NEXT:    movzbl 2(%eax), %eax
-; X86-NEXT:    movzbl 2(%ecx), %ecx
-; X86-NEXT:    subl %ecx, %eax
-; X86-NEXT:    jmp .LBB4_2
-; X86-NEXT:  .LBB4_3: # %res_block
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpw %si, %dx
-; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    orl $1, %eax
-; X86-NEXT:  .LBB4_2: # %endblock
-; X86-NEXT:    popl %esi
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 3) nounwind
-  ret i32 %m
-}
-
-define i1 @length3_eq(ptr %X, ptr %Y) nounwind optsize {
-; X86-LABEL: length3_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzwl (%ecx), %edx
-; X86-NEXT:    xorw (%eax), %dx
-; X86-NEXT:    movb 2(%ecx), %cl
-; X86-NEXT:    xorb 2(%eax), %cl
-; X86-NEXT:    movzbl %cl, %eax
-; X86-NEXT:    orw %dx, %ax
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 3) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length4(ptr %X, ptr %Y) nounwind optsize {
-; X86-LABEL: length4:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %ecx
-; X86-NEXT:    movl (%eax), %edx
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    seta %al
-; X86-NEXT:    sbbl $0, %eax
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 4) nounwind
-  ret i32 %m
-}
-
-define i1 @length4_eq(ptr %X, ptr %Y) nounwind optsize {
-; X86-LABEL: length4_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %ecx
-; X86-NEXT:    cmpl (%eax), %ecx
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 4) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length4_eq_const(ptr %X) nounwind optsize {
-; X86-LABEL: length4_eq_const:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl $875770417, (%eax) # imm = 0x34333231
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([65 x i8], ptr @.str, i32 0, i32 1), i32 4) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length5(ptr %X, ptr %Y) nounwind optsize {
-; X86-LABEL: length5:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl (%eax), %edx
-; X86-NEXT:    movl (%ecx), %esi
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    bswapl %esi
-; X86-NEXT:    cmpl %esi, %edx
-; X86-NEXT:    jne .LBB9_3
-; X86-NEXT:  # %bb.1: # %loadbb1
-; X86-NEXT:    movzbl 4(%eax), %eax
-; X86-NEXT:    movzbl 4(%ecx), %ecx
-; X86-NEXT:    subl %ecx, %eax
-; X86-NEXT:    jmp .LBB9_2
-; X86-NEXT:  .LBB9_3: # %res_block
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %esi, %edx
-; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    orl $1, %eax
-; X86-NEXT:  .LBB9_2: # %endblock
-; X86-NEXT:    popl %esi
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 5) nounwind
-  ret i32 %m
-}
-
-define i1 @length5_eq(ptr %X, ptr %Y) nounwind optsize {
-; X86-LABEL: length5_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %edx
-; X86-NEXT:    xorl (%eax), %edx
-; X86-NEXT:    movb 4(%ecx), %cl
-; X86-NEXT:    xorb 4(%eax), %cl
-; X86-NEXT:    movzbl %cl, %eax
-; X86-NEXT:    orl %edx, %eax
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 5) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length8(ptr %X, ptr %Y) nounwind optsize {
-; X86-LABEL: length8:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl (%esi), %ecx
-; X86-NEXT:    movl (%eax), %edx
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    jne .LBB11_2
-; X86-NEXT:  # %bb.1: # %loadbb1
-; X86-NEXT:    movl 4(%esi), %ecx
-; X86-NEXT:    movl 4(%eax), %edx
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    je .LBB11_3
-; X86-NEXT:  .LBB11_2: # %res_block
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    orl $1, %eax
-; X86-NEXT:  .LBB11_3: # %endblock
-; X86-NEXT:    popl %esi
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 8) nounwind
-  ret i32 %m
-}
-
-define i1 @length8_eq(ptr %X, ptr %Y) nounwind optsize {
-; X86-LABEL: length8_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %edx
-; X86-NEXT:    movl 4(%ecx), %ecx
-; X86-NEXT:    xorl (%eax), %edx
-; X86-NEXT:    xorl 4(%eax), %ecx
-; X86-NEXT:    orl %edx, %ecx
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 8) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length8_eq_const(ptr %X) nounwind optsize {
-; X86-LABEL: length8_eq_const:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl $858927408, %ecx # imm = 0x33323130
-; X86-NEXT:    xorl (%eax), %ecx
-; X86-NEXT:    movl $926299444, %edx # imm = 0x37363534
-; X86-NEXT:    xorl 4(%eax), %edx
-; X86-NEXT:    orl %ecx, %edx
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 8) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length12_eq(ptr %X, ptr %Y) nounwind optsize {
-; X86-LABEL: length12_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $12
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 12) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length12(ptr %X, ptr %Y) nounwind optsize {
-; X86-LABEL: length12:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $12
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 12) nounwind
-  ret i32 %m
-}
-
-; PR33329 - https://bugs.llvm.org/show_bug.cgi?id=33329
-
-define i32 @length16(ptr %X, ptr %Y) nounwind optsize {
-; X86-LABEL: length16:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $16
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 16) nounwind
-  ret i32 %m
-}
-
-define i1 @length16_eq(ptr %x, ptr %y) nounwind optsize {
-; X86-NOSSE-LABEL: length16_eq:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl $16
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    calll memcmp
-; X86-NOSSE-NEXT:    addl $12, %esp
-; X86-NOSSE-NEXT:    testl %eax, %eax
-; X86-NOSSE-NEXT:    setne %al
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE2-LABEL: length16_eq:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm1
-; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm1
-; X86-SSE2-NEXT:    pmovmskb %xmm1, %eax
-; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X86-SSE2-NEXT:    setne %al
-; X86-SSE2-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 16) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length16_eq_const(ptr %X) nounwind optsize {
-; X86-NOSSE-LABEL: length16_eq_const:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl $16
-; X86-NOSSE-NEXT:    pushl $.L.str
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    calll memcmp
-; X86-NOSSE-NEXT:    addl $12, %esp
-; X86-NOSSE-NEXT:    testl %eax, %eax
-; X86-NOSSE-NEXT:    sete %al
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE2-LABEL: length16_eq_const:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm0
-; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X86-SSE2-NEXT:    sete %al
-; X86-SSE2-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 16) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-; PR33914 - https://bugs.llvm.org/show_bug.cgi?id=33914
-
-define i32 @length24(ptr %X, ptr %Y) nounwind optsize {
-; X86-LABEL: length24:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $24
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 24) nounwind
-  ret i32 %m
-}
-
-define i1 @length24_eq(ptr %x, ptr %y) nounwind optsize {
-; X86-NOSSE-LABEL: length24_eq:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl $24
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    calll memcmp
-; X86-NOSSE-NEXT:    addl $12, %esp
-; X86-NOSSE-NEXT:    testl %eax, %eax
-; X86-NOSSE-NEXT:    sete %al
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE2-LABEL: length24_eq:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE2-NEXT:    movdqu 8(%ecx), %xmm1
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm2
-; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
-; X86-SSE2-NEXT:    movdqu 8(%eax), %xmm0
-; X86-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
-; X86-SSE2-NEXT:    pand %xmm2, %xmm0
-; X86-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X86-SSE2-NEXT:    sete %al
-; X86-SSE2-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 24) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length24_eq_const(ptr %X) nounwind optsize {
-; X86-NOSSE-LABEL: length24_eq_const:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl $24
-; X86-NOSSE-NEXT:    pushl $.L.str
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    calll memcmp
-; X86-NOSSE-NEXT:    addl $12, %esp
-; X86-NOSSE-NEXT:    testl %eax, %eax
-; X86-NOSSE-NEXT:    setne %al
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE2-LABEL: length24_eq_const:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm0
-; X86-SSE2-NEXT:    movdqu 8(%eax), %xmm1
-; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE2-NEXT:    pand %xmm1, %xmm0
-; X86-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X86-SSE2-NEXT:    setne %al
-; X86-SSE2-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 24) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length32(ptr %X, ptr %Y) nounwind optsize {
-; X86-LABEL: length32:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $32
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 32) nounwind
-  ret i32 %m
-}
-
-; PR33325 - https://bugs.llvm.org/show_bug.cgi?id=33325
-
-define i1 @length32_eq(ptr %x, ptr %y) nounwind optsize {
-; X86-NOSSE-LABEL: length32_eq:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl $32
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    calll memcmp
-; X86-NOSSE-NEXT:    addl $12, %esp
-; X86-NOSSE-NEXT:    testl %eax, %eax
-; X86-NOSSE-NEXT:    sete %al
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE2-LABEL: length32_eq:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE2-NEXT:    movdqu 16(%ecx), %xmm1
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm2
-; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
-; X86-SSE2-NEXT:    movdqu 16(%eax), %xmm0
-; X86-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
-; X86-SSE2-NEXT:    pand %xmm2, %xmm0
-; X86-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X86-SSE2-NEXT:    sete %al
-; X86-SSE2-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 32) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length32_eq_const(ptr %X) nounwind optsize {
-; X86-NOSSE-LABEL: length32_eq_const:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl $32
-; X86-NOSSE-NEXT:    pushl $.L.str
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    calll memcmp
-; X86-NOSSE-NEXT:    addl $12, %esp
-; X86-NOSSE-NEXT:    testl %eax, %eax
-; X86-NOSSE-NEXT:    setne %al
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE2-LABEL: length32_eq_const:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm0
-; X86-SSE2-NEXT:    movdqu 16(%eax), %xmm1
-; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE2-NEXT:    pand %xmm1, %xmm0
-; X86-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X86-SSE2-NEXT:    setne %al
-; X86-SSE2-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 32) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length64(ptr %X, ptr %Y) nounwind optsize {
-; X86-LABEL: length64:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $64
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 64) nounwind
-  ret i32 %m
-}
-
-define i1 @length64_eq(ptr %x, ptr %y) nounwind optsize {
-; X86-LABEL: length64_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $64
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 64) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length64_eq_const(ptr %X) nounwind optsize {
-; X86-LABEL: length64_eq_const:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $64
-; X86-NEXT:    pushl $.L.str
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 64) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @bcmp_length2(ptr %X, ptr %Y) nounwind optsize {
-; X86-LABEL: bcmp_length2:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movzwl (%eax), %edx
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpw (%ecx), %dx
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %m = tail call i32 @bcmp(ptr %X, ptr %Y, i32 2) nounwind
-  ret i32 %m
-}
diff --git a/llvm/test/CodeGen/X86/memcmp-optsize.ll b/llvm/test/CodeGen/X86/memcmp-optsize.ll
deleted file mode 100644
index c0c7b98d471cd4..00000000000000
--- a/llvm/test/CodeGen/X86/memcmp-optsize.ll
+++ /dev/null
@@ -1,596 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2
-
-; This tests codegen time inlining/optimization of memcmp
-; rdar://6480398
-
-@.str = private constant [65 x i8] c"0123456789012345678901234567890123456789012345678901234567890123\00", align 1
-
-declare dso_local i32 @memcmp(ptr, ptr, i64)
-declare dso_local i32 @bcmp(ptr, ptr, i64)
-
-define i32 @length2(ptr %X, ptr %Y) nounwind optsize {
-; X64-LABEL: length2:
-; X64:       # %bb.0:
-; X64-NEXT:    movzwl (%rdi), %eax
-; X64-NEXT:    movzwl (%rsi), %ecx
-; X64-NEXT:    rolw $8, %ax
-; X64-NEXT:    rolw $8, %cx
-; X64-NEXT:    movzwl %ax, %eax
-; X64-NEXT:    movzwl %cx, %ecx
-; X64-NEXT:    subl %ecx, %eax
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
-  ret i32 %m
-}
-
-define i1 @length2_eq(ptr %X, ptr %Y) nounwind optsize {
-; X64-LABEL: length2_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movzwl (%rdi), %eax
-; X64-NEXT:    cmpw (%rsi), %ax
-; X64-NEXT:    sete %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length2_eq_const(ptr %X) nounwind optsize {
-; X64-LABEL: length2_eq_const:
-; X64:       # %bb.0:
-; X64-NEXT:    movzwl (%rdi), %eax
-; X64-NEXT:    cmpl $12849, %eax # imm = 0x3231
-; X64-NEXT:    setne %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([65 x i8], ptr @.str, i32 0, i32 1), i64 2) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length2_eq_nobuiltin_attr(ptr %X, ptr %Y) nounwind optsize {
-; X64-LABEL: length2_eq_nobuiltin_attr:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $2, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    sete %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind nobuiltin
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length3(ptr %X, ptr %Y) nounwind optsize {
-; X64-LABEL: length3:
-; X64:       # %bb.0:
-; X64-NEXT:    movzwl (%rdi), %ecx
-; X64-NEXT:    movzwl (%rsi), %edx
-; X64-NEXT:    rolw $8, %cx
-; X64-NEXT:    rolw $8, %dx
-; X64-NEXT:    cmpw %dx, %cx
-; X64-NEXT:    jne .LBB4_3
-; X64-NEXT:  # %bb.1: # %loadbb1
-; X64-NEXT:    movzbl 2(%rdi), %eax
-; X64-NEXT:    movzbl 2(%rsi), %ecx
-; X64-NEXT:    subl %ecx, %eax
-; X64-NEXT:    retq
-; X64-NEXT:  .LBB4_3: # %res_block
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpw %dx, %cx
-; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    orl $1, %eax
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 3) nounwind
-  ret i32 %m
-}
-
-define i1 @length3_eq(ptr %X, ptr %Y) nounwind optsize {
-; X64-LABEL: length3_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movzwl (%rdi), %eax
-; X64-NEXT:    xorw (%rsi), %ax
-; X64-NEXT:    movb 2(%rdi), %cl
-; X64-NEXT:    xorb 2(%rsi), %cl
-; X64-NEXT:    movzbl %cl, %ecx
-; X64-NEXT:    orw %ax, %cx
-; X64-NEXT:    setne %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 3) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length4(ptr %X, ptr %Y) nounwind optsize {
-; X64-LABEL: length4:
-; X64:       # %bb.0:
-; X64-NEXT:    movl (%rdi), %ecx
-; X64-NEXT:    movl (%rsi), %edx
-; X64-NEXT:    bswapl %ecx
-; X64-NEXT:    bswapl %edx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpl %edx, %ecx
-; X64-NEXT:    seta %al
-; X64-NEXT:    sbbl $0, %eax
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
-  ret i32 %m
-}
-
-define i1 @length4_eq(ptr %X, ptr %Y) nounwind optsize {
-; X64-LABEL: length4_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    cmpl (%rsi), %eax
-; X64-NEXT:    setne %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length4_eq_const(ptr %X) nounwind optsize {
-; X64-LABEL: length4_eq_const:
-; X64:       # %bb.0:
-; X64-NEXT:    cmpl $875770417, (%rdi) # imm = 0x34333231
-; X64-NEXT:    sete %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([65 x i8], ptr @.str, i32 0, i32 1), i64 4) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length5(ptr %X, ptr %Y) nounwind optsize {
-; X64-LABEL: length5:
-; X64:       # %bb.0:
-; X64-NEXT:    movl (%rdi), %ecx
-; X64-NEXT:    movl (%rsi), %edx
-; X64-NEXT:    bswapl %ecx
-; X64-NEXT:    bswapl %edx
-; X64-NEXT:    cmpl %edx, %ecx
-; X64-NEXT:    jne .LBB9_3
-; X64-NEXT:  # %bb.1: # %loadbb1
-; X64-NEXT:    movzbl 4(%rdi), %eax
-; X64-NEXT:    movzbl 4(%rsi), %ecx
-; X64-NEXT:    subl %ecx, %eax
-; X64-NEXT:    retq
-; X64-NEXT:  .LBB9_3: # %res_block
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpl %edx, %ecx
-; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    orl $1, %eax
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) nounwind
-  ret i32 %m
-}
-
-define i1 @length5_eq(ptr %X, ptr %Y) nounwind optsize {
-; X64-LABEL: length5_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    xorl (%rsi), %eax
-; X64-NEXT:    movb 4(%rdi), %cl
-; X64-NEXT:    xorb 4(%rsi), %cl
-; X64-NEXT:    movzbl %cl, %ecx
-; X64-NEXT:    orl %eax, %ecx
-; X64-NEXT:    setne %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length8(ptr %X, ptr %Y) nounwind optsize {
-; X64-LABEL: length8:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rcx
-; X64-NEXT:    movq (%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    seta %al
-; X64-NEXT:    sbbl $0, %eax
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 8) nounwind
-  ret i32 %m
-}
-
-define i1 @length8_eq(ptr %X, ptr %Y) nounwind optsize {
-; X64-LABEL: length8_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rax
-; X64-NEXT:    cmpq (%rsi), %rax
-; X64-NEXT:    sete %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 8) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length8_eq_const(ptr %X) nounwind optsize {
-; X64-LABEL: length8_eq_const:
-; X64:       # %bb.0:
-; X64-NEXT:    movabsq $3978425819141910832, %rax # imm = 0x3736353433323130
-; X64-NEXT:    cmpq %rax, (%rdi)
-; X64-NEXT:    setne %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 8) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length12_eq(ptr %X, ptr %Y) nounwind optsize {
-; X64-LABEL: length12_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rax
-; X64-NEXT:    xorq (%rsi), %rax
-; X64-NEXT:    movl 8(%rdi), %ecx
-; X64-NEXT:    xorl 8(%rsi), %ecx
-; X64-NEXT:    orq %rax, %rcx
-; X64-NEXT:    setne %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 12) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length12(ptr %X, ptr %Y) nounwind optsize {
-; X64-LABEL: length12:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rcx
-; X64-NEXT:    movq (%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    jne .LBB15_2
-; X64-NEXT:  # %bb.1: # %loadbb1
-; X64-NEXT:    movl 8(%rdi), %ecx
-; X64-NEXT:    movl 8(%rsi), %edx
-; X64-NEXT:    bswapl %ecx
-; X64-NEXT:    bswapl %edx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    je .LBB15_3
-; X64-NEXT:  .LBB15_2: # %res_block
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    orl $1, %eax
-; X64-NEXT:  .LBB15_3: # %endblock
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 12) nounwind
-  ret i32 %m
-}
-
-; PR33329 - https://bugs.llvm.org/show_bug.cgi?id=33329
-
-define i32 @length16(ptr %X, ptr %Y) nounwind optsize {
-; X64-LABEL: length16:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rcx
-; X64-NEXT:    movq (%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    jne .LBB16_2
-; X64-NEXT:  # %bb.1: # %loadbb1
-; X64-NEXT:    movq 8(%rdi), %rcx
-; X64-NEXT:    movq 8(%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    je .LBB16_3
-; X64-NEXT:  .LBB16_2: # %res_block
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    orl $1, %eax
-; X64-NEXT:  .LBB16_3: # %endblock
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 16) nounwind
-  ret i32 %m
-}
-
-define i1 @length16_eq(ptr %x, ptr %y) nounwind optsize {
-; X64-SSE2-LABEL: length16_eq:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT:    movdqu (%rsi), %xmm1
-; X64-SSE2-NEXT:    pcmpeqb %xmm0, %xmm1
-; X64-SSE2-NEXT:    pmovmskb %xmm1, %eax
-; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT:    setne %al
-; X64-SSE2-NEXT:    retq
-;
-; X64-AVX-LABEL: length16_eq:
-; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-AVX-NEXT:    vpxor (%rsi), %xmm0, %xmm0
-; X64-AVX-NEXT:    vptest %xmm0, %xmm0
-; X64-AVX-NEXT:    setne %al
-; X64-AVX-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 16) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length16_eq_const(ptr %X) nounwind optsize {
-; X64-SSE2-LABEL: length16_eq_const:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT:    sete %al
-; X64-SSE2-NEXT:    retq
-;
-; X64-AVX-LABEL: length16_eq_const:
-; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-AVX-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT:    vptest %xmm0, %xmm0
-; X64-AVX-NEXT:    sete %al
-; X64-AVX-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 16) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-; PR33914 - https://bugs.llvm.org/show_bug.cgi?id=33914
-
-define i32 @length24(ptr %X, ptr %Y) nounwind optsize {
-; X64-LABEL: length24:
-; X64:       # %bb.0:
-; X64-NEXT:    movl $24, %edx
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 24) nounwind
-  ret i32 %m
-}
-
-define i1 @length24_eq(ptr %x, ptr %y) nounwind optsize {
-; X64-SSE2-LABEL: length24_eq:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT:    movdqu (%rsi), %xmm1
-; X64-SSE2-NEXT:    pcmpeqb %xmm0, %xmm1
-; X64-SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X64-SSE2-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
-; X64-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
-; X64-SSE2-NEXT:    pand %xmm1, %xmm2
-; X64-SSE2-NEXT:    pmovmskb %xmm2, %eax
-; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT:    sete %al
-; X64-SSE2-NEXT:    retq
-;
-; X64-AVX-LABEL: length24_eq:
-; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
-; X64-AVX-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
-; X64-AVX-NEXT:    vpxor %xmm2, %xmm1, %xmm1
-; X64-AVX-NEXT:    vpxor (%rsi), %xmm0, %xmm0
-; X64-AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
-; X64-AVX-NEXT:    vptest %xmm0, %xmm0
-; X64-AVX-NEXT:    sete %al
-; X64-AVX-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 24) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length24_eq_const(ptr %X) nounwind optsize {
-; X64-SSE2-LABEL: length24_eq_const:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
-; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE2-NEXT:    pand %xmm1, %xmm0
-; X64-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT:    setne %al
-; X64-SSE2-NEXT:    retq
-;
-; X64-AVX-LABEL: length24_eq_const:
-; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
-; X64-AVX-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; X64-AVX-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT:    vptest %xmm0, %xmm0
-; X64-AVX-NEXT:    setne %al
-; X64-AVX-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 24) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length32(ptr %X, ptr %Y) nounwind optsize {
-; X64-LABEL: length32:
-; X64:       # %bb.0:
-; X64-NEXT:    movl $32, %edx
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 32) nounwind
-  ret i32 %m
-}
-
-; PR33325 - https://bugs.llvm.org/show_bug.cgi?id=33325
-
-define i1 @length32_eq(ptr %x, ptr %y) nounwind optsize {
-; X64-SSE2-LABEL: length32_eq:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-SSE2-NEXT:    movdqu (%rsi), %xmm2
-; X64-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
-; X64-SSE2-NEXT:    movdqu 16(%rsi), %xmm0
-; X64-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
-; X64-SSE2-NEXT:    pand %xmm2, %xmm0
-; X64-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT:    sete %al
-; X64-SSE2-NEXT:    retq
-;
-; X64-AVX1-LABEL: length32_eq:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    vmovups (%rdi), %ymm0
-; X64-AVX1-NEXT:    vxorps (%rsi), %ymm0, %ymm0
-; X64-AVX1-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX1-NEXT:    sete %al
-; X64-AVX1-NEXT:    vzeroupper
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length32_eq:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT:    vpxor (%rsi), %ymm0, %ymm0
-; X64-AVX2-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX2-NEXT:    sete %al
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 32) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length32_eq_const(ptr %X) nounwind optsize {
-; X64-SSE2-LABEL: length32_eq_const:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE2-NEXT:    pand %xmm1, %xmm0
-; X64-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT:    setne %al
-; X64-SSE2-NEXT:    retq
-;
-; X64-AVX1-LABEL: length32_eq_const:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    vmovups (%rdi), %ymm0
-; X64-AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX1-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX1-NEXT:    setne %al
-; X64-AVX1-NEXT:    vzeroupper
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length32_eq_const:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX2-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX2-NEXT:    setne %al
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 32) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length64(ptr %X, ptr %Y) nounwind optsize {
-; X64-LABEL: length64:
-; X64:       # %bb.0:
-; X64-NEXT:    movl $64, %edx
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 64) nounwind
-  ret i32 %m
-}
-
-define i1 @length64_eq(ptr %x, ptr %y) nounwind optsize {
-; X64-SSE2-LABEL: length64_eq:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    pushq %rax
-; X64-SSE2-NEXT:    movl $64, %edx
-; X64-SSE2-NEXT:    callq memcmp
-; X64-SSE2-NEXT:    testl %eax, %eax
-; X64-SSE2-NEXT:    setne %al
-; X64-SSE2-NEXT:    popq %rcx
-; X64-SSE2-NEXT:    retq
-;
-; X64-AVX1-LABEL: length64_eq:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    vmovups (%rdi), %ymm0
-; X64-AVX1-NEXT:    vmovups 32(%rdi), %ymm1
-; X64-AVX1-NEXT:    vxorps 32(%rsi), %ymm1, %ymm1
-; X64-AVX1-NEXT:    vxorps (%rsi), %ymm0, %ymm0
-; X64-AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
-; X64-AVX1-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX1-NEXT:    setne %al
-; X64-AVX1-NEXT:    vzeroupper
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length64_eq:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT:    vmovdqu 32(%rdi), %ymm1
-; X64-AVX2-NEXT:    vpxor 32(%rsi), %ymm1, %ymm1
-; X64-AVX2-NEXT:    vpxor (%rsi), %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX2-NEXT:    setne %al
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 64) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length64_eq_const(ptr %X) nounwind optsize {
-; X64-SSE2-LABEL: length64_eq_const:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    pushq %rax
-; X64-SSE2-NEXT:    movl $.L.str, %esi
-; X64-SSE2-NEXT:    movl $64, %edx
-; X64-SSE2-NEXT:    callq memcmp
-; X64-SSE2-NEXT:    testl %eax, %eax
-; X64-SSE2-NEXT:    sete %al
-; X64-SSE2-NEXT:    popq %rcx
-; X64-SSE2-NEXT:    retq
-;
-; X64-AVX1-LABEL: length64_eq_const:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    vmovups (%rdi), %ymm0
-; X64-AVX1-NEXT:    vmovups 32(%rdi), %ymm1
-; X64-AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; X64-AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
-; X64-AVX1-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX1-NEXT:    sete %al
-; X64-AVX1-NEXT:    vzeroupper
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length64_eq_const:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT:    vmovdqu 32(%rdi), %ymm1
-; X64-AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; X64-AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX2-NEXT:    sete %al
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 64) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @bcmp_length2(ptr %X, ptr %Y) nounwind optsize {
-; X64-LABEL: bcmp_length2:
-; X64:       # %bb.0:
-; X64-NEXT:    movzwl (%rdi), %ecx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpw (%rsi), %cx
-; X64-NEXT:    setne %al
-; X64-NEXT:    retq
-  %m = tail call i32 @bcmp(ptr %X, ptr %Y, i64 2) nounwind
-  ret i32 %m
-}
diff --git a/llvm/test/CodeGen/X86/memcmp-pgso-x32.ll b/llvm/test/CodeGen/X86/memcmp-pgso-x32.ll
deleted file mode 100644
index cb45fd3ebb9068..00000000000000
--- a/llvm/test/CodeGen/X86/memcmp-pgso-x32.ll
+++ /dev/null
@@ -1,600 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=cmov | FileCheck %s --check-prefix=X86 --check-prefix=X86-NOSSE
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE2
-
-; This tests codegen time inlining/optimization of memcmp
-; rdar://6480398
-
-@.str = private constant [65 x i8] c"0123456789012345678901234567890123456789012345678901234567890123\00", align 1
-
-declare dso_local i32 @memcmp(ptr, ptr, i32)
-declare dso_local i32 @bcmp(ptr, ptr, i32)
-
-define i32 @length2(ptr %X, ptr %Y) nounwind !prof !14 {
-; X86-LABEL: length2:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzwl (%ecx), %ecx
-; X86-NEXT:    movzwl (%eax), %edx
-; X86-NEXT:    rolw $8, %cx
-; X86-NEXT:    rolw $8, %dx
-; X86-NEXT:    movzwl %cx, %eax
-; X86-NEXT:    movzwl %dx, %ecx
-; X86-NEXT:    subl %ecx, %eax
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind
-  ret i32 %m
-}
-
-define i1 @length2_eq(ptr %X, ptr %Y) nounwind !prof !14 {
-; X86-LABEL: length2_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzwl (%ecx), %ecx
-; X86-NEXT:    cmpw (%eax), %cx
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length2_eq_const(ptr %X) nounwind !prof !14 {
-; X86-LABEL: length2_eq_const:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movzwl (%eax), %eax
-; X86-NEXT:    cmpl $12849, %eax # imm = 0x3231
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([65 x i8], ptr @.str, i32 0, i32 1), i32 2) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length2_eq_nobuiltin_attr(ptr %X, ptr %Y) nounwind !prof !14 {
-; X86-LABEL: length2_eq_nobuiltin_attr:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $2
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind nobuiltin
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length3(ptr %X, ptr %Y) nounwind !prof !14 {
-; X86-LABEL: length3:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movzwl (%eax), %edx
-; X86-NEXT:    movzwl (%ecx), %esi
-; X86-NEXT:    rolw $8, %dx
-; X86-NEXT:    rolw $8, %si
-; X86-NEXT:    cmpw %si, %dx
-; X86-NEXT:    jne .LBB4_3
-; X86-NEXT:  # %bb.1: # %loadbb1
-; X86-NEXT:    movzbl 2(%eax), %eax
-; X86-NEXT:    movzbl 2(%ecx), %ecx
-; X86-NEXT:    subl %ecx, %eax
-; X86-NEXT:    jmp .LBB4_2
-; X86-NEXT:  .LBB4_3: # %res_block
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpw %si, %dx
-; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    orl $1, %eax
-; X86-NEXT:  .LBB4_2: # %endblock
-; X86-NEXT:    popl %esi
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 3) nounwind
-  ret i32 %m
-}
-
-define i1 @length3_eq(ptr %X, ptr %Y) nounwind !prof !14 {
-; X86-LABEL: length3_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzwl (%ecx), %edx
-; X86-NEXT:    xorw (%eax), %dx
-; X86-NEXT:    movb 2(%ecx), %cl
-; X86-NEXT:    xorb 2(%eax), %cl
-; X86-NEXT:    movzbl %cl, %eax
-; X86-NEXT:    orw %dx, %ax
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 3) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length4(ptr %X, ptr %Y) nounwind !prof !14 {
-; X86-LABEL: length4:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %ecx
-; X86-NEXT:    movl (%eax), %edx
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    seta %al
-; X86-NEXT:    sbbl $0, %eax
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 4) nounwind
-  ret i32 %m
-}
-
-define i1 @length4_eq(ptr %X, ptr %Y) nounwind !prof !14 {
-; X86-LABEL: length4_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %ecx
-; X86-NEXT:    cmpl (%eax), %ecx
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 4) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length4_eq_const(ptr %X) nounwind !prof !14 {
-; X86-LABEL: length4_eq_const:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl $875770417, (%eax) # imm = 0x34333231
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([65 x i8], ptr @.str, i32 0, i32 1), i32 4) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length5(ptr %X, ptr %Y) nounwind !prof !14 {
-; X86-LABEL: length5:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl (%eax), %edx
-; X86-NEXT:    movl (%ecx), %esi
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    bswapl %esi
-; X86-NEXT:    cmpl %esi, %edx
-; X86-NEXT:    jne .LBB9_3
-; X86-NEXT:  # %bb.1: # %loadbb1
-; X86-NEXT:    movzbl 4(%eax), %eax
-; X86-NEXT:    movzbl 4(%ecx), %ecx
-; X86-NEXT:    subl %ecx, %eax
-; X86-NEXT:    jmp .LBB9_2
-; X86-NEXT:  .LBB9_3: # %res_block
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %esi, %edx
-; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    orl $1, %eax
-; X86-NEXT:  .LBB9_2: # %endblock
-; X86-NEXT:    popl %esi
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 5) nounwind
-  ret i32 %m
-}
-
-define i1 @length5_eq(ptr %X, ptr %Y) nounwind !prof !14 {
-; X86-LABEL: length5_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %edx
-; X86-NEXT:    xorl (%eax), %edx
-; X86-NEXT:    movb 4(%ecx), %cl
-; X86-NEXT:    xorb 4(%eax), %cl
-; X86-NEXT:    movzbl %cl, %eax
-; X86-NEXT:    orl %edx, %eax
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 5) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length8(ptr %X, ptr %Y) nounwind !prof !14 {
-; X86-LABEL: length8:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl (%esi), %ecx
-; X86-NEXT:    movl (%eax), %edx
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    jne .LBB11_2
-; X86-NEXT:  # %bb.1: # %loadbb1
-; X86-NEXT:    movl 4(%esi), %ecx
-; X86-NEXT:    movl 4(%eax), %edx
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    je .LBB11_3
-; X86-NEXT:  .LBB11_2: # %res_block
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    orl $1, %eax
-; X86-NEXT:  .LBB11_3: # %endblock
-; X86-NEXT:    popl %esi
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 8) nounwind
-  ret i32 %m
-}
-
-define i1 @length8_eq(ptr %X, ptr %Y) nounwind !prof !14 {
-; X86-LABEL: length8_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %edx
-; X86-NEXT:    movl 4(%ecx), %ecx
-; X86-NEXT:    xorl (%eax), %edx
-; X86-NEXT:    xorl 4(%eax), %ecx
-; X86-NEXT:    orl %edx, %ecx
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 8) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length8_eq_const(ptr %X) nounwind !prof !14 {
-; X86-LABEL: length8_eq_const:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl $858927408, %ecx # imm = 0x33323130
-; X86-NEXT:    xorl (%eax), %ecx
-; X86-NEXT:    movl $926299444, %edx # imm = 0x37363534
-; X86-NEXT:    xorl 4(%eax), %edx
-; X86-NEXT:    orl %ecx, %edx
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 8) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length12_eq(ptr %X, ptr %Y) nounwind !prof !14 {
-; X86-LABEL: length12_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $12
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 12) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length12(ptr %X, ptr %Y) nounwind !prof !14 {
-; X86-LABEL: length12:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $12
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 12) nounwind
-  ret i32 %m
-}
-
-; PR33329 - https://bugs.llvm.org/show_bug.cgi?id=33329
-
-define i32 @length16(ptr %X, ptr %Y) nounwind !prof !14 {
-; X86-LABEL: length16:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $16
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 16) nounwind
-  ret i32 %m
-}
-
-define i1 @length16_eq(ptr %x, ptr %y) nounwind !prof !14 {
-; X86-NOSSE-LABEL: length16_eq:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl $16
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    calll memcmp
-; X86-NOSSE-NEXT:    addl $12, %esp
-; X86-NOSSE-NEXT:    testl %eax, %eax
-; X86-NOSSE-NEXT:    setne %al
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE2-LABEL: length16_eq:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm1
-; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm1
-; X86-SSE2-NEXT:    pmovmskb %xmm1, %eax
-; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X86-SSE2-NEXT:    setne %al
-; X86-SSE2-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 16) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length16_eq_const(ptr %X) nounwind !prof !14 {
-; X86-NOSSE-LABEL: length16_eq_const:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl $16
-; X86-NOSSE-NEXT:    pushl $.L.str
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    calll memcmp
-; X86-NOSSE-NEXT:    addl $12, %esp
-; X86-NOSSE-NEXT:    testl %eax, %eax
-; X86-NOSSE-NEXT:    sete %al
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE2-LABEL: length16_eq_const:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm0
-; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X86-SSE2-NEXT:    sete %al
-; X86-SSE2-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 16) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-; PR33914 - https://bugs.llvm.org/show_bug.cgi?id=33914
-
-define i32 @length24(ptr %X, ptr %Y) nounwind !prof !14 {
-; X86-LABEL: length24:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $24
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 24) nounwind
-  ret i32 %m
-}
-
-define i1 @length24_eq(ptr %x, ptr %y) nounwind !prof !14 {
-; X86-NOSSE-LABEL: length24_eq:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl $24
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    calll memcmp
-; X86-NOSSE-NEXT:    addl $12, %esp
-; X86-NOSSE-NEXT:    testl %eax, %eax
-; X86-NOSSE-NEXT:    sete %al
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE2-LABEL: length24_eq:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE2-NEXT:    movdqu 8(%ecx), %xmm1
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm2
-; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
-; X86-SSE2-NEXT:    movdqu 8(%eax), %xmm0
-; X86-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
-; X86-SSE2-NEXT:    pand %xmm2, %xmm0
-; X86-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X86-SSE2-NEXT:    sete %al
-; X86-SSE2-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 24) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length24_eq_const(ptr %X) nounwind !prof !14 {
-; X86-NOSSE-LABEL: length24_eq_const:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl $24
-; X86-NOSSE-NEXT:    pushl $.L.str
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    calll memcmp
-; X86-NOSSE-NEXT:    addl $12, %esp
-; X86-NOSSE-NEXT:    testl %eax, %eax
-; X86-NOSSE-NEXT:    setne %al
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE2-LABEL: length24_eq_const:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm0
-; X86-SSE2-NEXT:    movdqu 8(%eax), %xmm1
-; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE2-NEXT:    pand %xmm1, %xmm0
-; X86-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X86-SSE2-NEXT:    setne %al
-; X86-SSE2-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 24) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length32(ptr %X, ptr %Y) nounwind !prof !14 {
-; X86-LABEL: length32:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $32
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 32) nounwind
-  ret i32 %m
-}
-
-; PR33325 - https://bugs.llvm.org/show_bug.cgi?id=33325
-
-define i1 @length32_eq(ptr %x, ptr %y) nounwind !prof !14 {
-; X86-NOSSE-LABEL: length32_eq:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl $32
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    calll memcmp
-; X86-NOSSE-NEXT:    addl $12, %esp
-; X86-NOSSE-NEXT:    testl %eax, %eax
-; X86-NOSSE-NEXT:    sete %al
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE2-LABEL: length32_eq:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE2-NEXT:    movdqu 16(%ecx), %xmm1
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm2
-; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
-; X86-SSE2-NEXT:    movdqu 16(%eax), %xmm0
-; X86-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
-; X86-SSE2-NEXT:    pand %xmm2, %xmm0
-; X86-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X86-SSE2-NEXT:    sete %al
-; X86-SSE2-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 32) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length32_eq_const(ptr %X) nounwind !prof !14 {
-; X86-NOSSE-LABEL: length32_eq_const:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl $32
-; X86-NOSSE-NEXT:    pushl $.L.str
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    calll memcmp
-; X86-NOSSE-NEXT:    addl $12, %esp
-; X86-NOSSE-NEXT:    testl %eax, %eax
-; X86-NOSSE-NEXT:    setne %al
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE2-LABEL: length32_eq_const:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm0
-; X86-SSE2-NEXT:    movdqu 16(%eax), %xmm1
-; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE2-NEXT:    pand %xmm1, %xmm0
-; X86-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X86-SSE2-NEXT:    setne %al
-; X86-SSE2-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 32) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length64(ptr %X, ptr %Y) nounwind !prof !14 {
-; X86-LABEL: length64:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $64
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 64) nounwind
-  ret i32 %m
-}
-
-define i1 @length64_eq(ptr %x, ptr %y) nounwind !prof !14 {
-; X86-LABEL: length64_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $64
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 64) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length64_eq_const(ptr %X) nounwind !prof !14 {
-; X86-LABEL: length64_eq_const:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $64
-; X86-NEXT:    pushl $.L.str
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 64) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @bcmp_length2(ptr %X, ptr %Y) nounwind !prof !14 {
-; X86-LABEL: bcmp_length2:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movzwl (%eax), %edx
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpw (%ecx), %dx
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %m = tail call i32 @bcmp(ptr %X, ptr %Y, i32 2) nounwind
-  ret i32 %m
-}
-
-!llvm.module.flags = !{!0}
-!0 = !{i32 1, !"ProfileSummary", !1}
-!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
-!2 = !{!"ProfileFormat", !"InstrProf"}
-!3 = !{!"TotalCount", i32 10000}
-!4 = !{!"MaxCount", i32 10}
-!5 = !{!"MaxInternalCount", i32 1}
-!6 = !{!"MaxFunctionCount", i32 1000}
-!7 = !{!"NumCounts", i32 3}
-!8 = !{!"NumFunctions", i32 3}
-!9 = !{!"DetailedSummary", !10}
-!10 = !{!11, !12, !13}
-!11 = !{i32 10000, i32 100, i32 1}
-!12 = !{i32 999000, i32 100, i32 1}
-!13 = !{i32 999999, i32 1, i32 2}
-!14 = !{!"function_entry_count", i32 0}
diff --git a/llvm/test/CodeGen/X86/memcmp-pgso.ll b/llvm/test/CodeGen/X86/memcmp-pgso.ll
deleted file mode 100644
index 720344a22e43b5..00000000000000
--- a/llvm/test/CodeGen/X86/memcmp-pgso.ll
+++ /dev/null
@@ -1,613 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2
-
-; This tests codegen time inlining/optimization of memcmp
-; rdar://6480398
-
-@.str = private constant [65 x i8] c"0123456789012345678901234567890123456789012345678901234567890123\00", align 1
-
-declare dso_local i32 @memcmp(ptr, ptr, i64)
-declare dso_local i32 @bcmp(ptr, ptr, i64)
-
-define i32 @length2(ptr %X, ptr %Y) nounwind !prof !14 {
-; X64-LABEL: length2:
-; X64:       # %bb.0:
-; X64-NEXT:    movzwl (%rdi), %eax
-; X64-NEXT:    movzwl (%rsi), %ecx
-; X64-NEXT:    rolw $8, %ax
-; X64-NEXT:    rolw $8, %cx
-; X64-NEXT:    movzwl %ax, %eax
-; X64-NEXT:    movzwl %cx, %ecx
-; X64-NEXT:    subl %ecx, %eax
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
-  ret i32 %m
-}
-
-define i1 @length2_eq(ptr %X, ptr %Y) nounwind !prof !14 {
-; X64-LABEL: length2_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movzwl (%rdi), %eax
-; X64-NEXT:    cmpw (%rsi), %ax
-; X64-NEXT:    sete %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length2_eq_const(ptr %X) nounwind !prof !14 {
-; X64-LABEL: length2_eq_const:
-; X64:       # %bb.0:
-; X64-NEXT:    movzwl (%rdi), %eax
-; X64-NEXT:    cmpl $12849, %eax # imm = 0x3231
-; X64-NEXT:    setne %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([65 x i8], ptr @.str, i32 0, i32 1), i64 2) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length2_eq_nobuiltin_attr(ptr %X, ptr %Y) nounwind !prof !14 {
-; X64-LABEL: length2_eq_nobuiltin_attr:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $2, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    sete %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind nobuiltin
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length3(ptr %X, ptr %Y) nounwind !prof !14 {
-; X64-LABEL: length3:
-; X64:       # %bb.0:
-; X64-NEXT:    movzwl (%rdi), %ecx
-; X64-NEXT:    movzwl (%rsi), %edx
-; X64-NEXT:    rolw $8, %cx
-; X64-NEXT:    rolw $8, %dx
-; X64-NEXT:    cmpw %dx, %cx
-; X64-NEXT:    jne .LBB4_3
-; X64-NEXT:  # %bb.1: # %loadbb1
-; X64-NEXT:    movzbl 2(%rdi), %eax
-; X64-NEXT:    movzbl 2(%rsi), %ecx
-; X64-NEXT:    subl %ecx, %eax
-; X64-NEXT:    retq
-; X64-NEXT:  .LBB4_3: # %res_block
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpw %dx, %cx
-; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    orl $1, %eax
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 3) nounwind
-  ret i32 %m
-}
-
-define i1 @length3_eq(ptr %X, ptr %Y) nounwind !prof !14 {
-; X64-LABEL: length3_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movzwl (%rdi), %eax
-; X64-NEXT:    xorw (%rsi), %ax
-; X64-NEXT:    movb 2(%rdi), %cl
-; X64-NEXT:    xorb 2(%rsi), %cl
-; X64-NEXT:    movzbl %cl, %ecx
-; X64-NEXT:    orw %ax, %cx
-; X64-NEXT:    setne %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 3) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length4(ptr %X, ptr %Y) nounwind !prof !14 {
-; X64-LABEL: length4:
-; X64:       # %bb.0:
-; X64-NEXT:    movl (%rdi), %ecx
-; X64-NEXT:    movl (%rsi), %edx
-; X64-NEXT:    bswapl %ecx
-; X64-NEXT:    bswapl %edx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpl %edx, %ecx
-; X64-NEXT:    seta %al
-; X64-NEXT:    sbbl $0, %eax
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
-  ret i32 %m
-}
-
-define i1 @length4_eq(ptr %X, ptr %Y) nounwind !prof !14 {
-; X64-LABEL: length4_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    cmpl (%rsi), %eax
-; X64-NEXT:    setne %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length4_eq_const(ptr %X) nounwind !prof !14 {
-; X64-LABEL: length4_eq_const:
-; X64:       # %bb.0:
-; X64-NEXT:    cmpl $875770417, (%rdi) # imm = 0x34333231
-; X64-NEXT:    sete %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([65 x i8], ptr @.str, i32 0, i32 1), i64 4) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length5(ptr %X, ptr %Y) nounwind !prof !14 {
-; X64-LABEL: length5:
-; X64:       # %bb.0:
-; X64-NEXT:    movl (%rdi), %ecx
-; X64-NEXT:    movl (%rsi), %edx
-; X64-NEXT:    bswapl %ecx
-; X64-NEXT:    bswapl %edx
-; X64-NEXT:    cmpl %edx, %ecx
-; X64-NEXT:    jne .LBB9_3
-; X64-NEXT:  # %bb.1: # %loadbb1
-; X64-NEXT:    movzbl 4(%rdi), %eax
-; X64-NEXT:    movzbl 4(%rsi), %ecx
-; X64-NEXT:    subl %ecx, %eax
-; X64-NEXT:    retq
-; X64-NEXT:  .LBB9_3: # %res_block
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpl %edx, %ecx
-; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    orl $1, %eax
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) nounwind
-  ret i32 %m
-}
-
-define i1 @length5_eq(ptr %X, ptr %Y) nounwind !prof !14 {
-; X64-LABEL: length5_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    xorl (%rsi), %eax
-; X64-NEXT:    movb 4(%rdi), %cl
-; X64-NEXT:    xorb 4(%rsi), %cl
-; X64-NEXT:    movzbl %cl, %ecx
-; X64-NEXT:    orl %eax, %ecx
-; X64-NEXT:    setne %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length8(ptr %X, ptr %Y) nounwind !prof !14 {
-; X64-LABEL: length8:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rcx
-; X64-NEXT:    movq (%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    seta %al
-; X64-NEXT:    sbbl $0, %eax
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 8) nounwind
-  ret i32 %m
-}
-
-define i1 @length8_eq(ptr %X, ptr %Y) nounwind !prof !14 {
-; X64-LABEL: length8_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rax
-; X64-NEXT:    cmpq (%rsi), %rax
-; X64-NEXT:    sete %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 8) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length8_eq_const(ptr %X) nounwind !prof !14 {
-; X64-LABEL: length8_eq_const:
-; X64:       # %bb.0:
-; X64-NEXT:    movabsq $3978425819141910832, %rax # imm = 0x3736353433323130
-; X64-NEXT:    cmpq %rax, (%rdi)
-; X64-NEXT:    setne %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 8) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length12_eq(ptr %X, ptr %Y) nounwind !prof !14 {
-; X64-LABEL: length12_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rax
-; X64-NEXT:    xorq (%rsi), %rax
-; X64-NEXT:    movl 8(%rdi), %ecx
-; X64-NEXT:    xorl 8(%rsi), %ecx
-; X64-NEXT:    orq %rax, %rcx
-; X64-NEXT:    setne %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 12) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length12(ptr %X, ptr %Y) nounwind !prof !14 {
-; X64-LABEL: length12:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rcx
-; X64-NEXT:    movq (%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    jne .LBB15_2
-; X64-NEXT:  # %bb.1: # %loadbb1
-; X64-NEXT:    movl 8(%rdi), %ecx
-; X64-NEXT:    movl 8(%rsi), %edx
-; X64-NEXT:    bswapl %ecx
-; X64-NEXT:    bswapl %edx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    je .LBB15_3
-; X64-NEXT:  .LBB15_2: # %res_block
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    orl $1, %eax
-; X64-NEXT:  .LBB15_3: # %endblock
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 12) nounwind
-  ret i32 %m
-}
-
-; PR33329 - https://bugs.llvm.org/show_bug.cgi?id=33329
-
-define i32 @length16(ptr %X, ptr %Y) nounwind !prof !14 {
-; X64-LABEL: length16:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rcx
-; X64-NEXT:    movq (%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    jne .LBB16_2
-; X64-NEXT:  # %bb.1: # %loadbb1
-; X64-NEXT:    movq 8(%rdi), %rcx
-; X64-NEXT:    movq 8(%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    je .LBB16_3
-; X64-NEXT:  .LBB16_2: # %res_block
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    orl $1, %eax
-; X64-NEXT:  .LBB16_3: # %endblock
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 16) nounwind
-  ret i32 %m
-}
-
-define i1 @length16_eq(ptr %x, ptr %y) nounwind !prof !14 {
-; X64-SSE2-LABEL: length16_eq:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT:    movdqu (%rsi), %xmm1
-; X64-SSE2-NEXT:    pcmpeqb %xmm0, %xmm1
-; X64-SSE2-NEXT:    pmovmskb %xmm1, %eax
-; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT:    setne %al
-; X64-SSE2-NEXT:    retq
-;
-; X64-AVX-LABEL: length16_eq:
-; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-AVX-NEXT:    vpxor (%rsi), %xmm0, %xmm0
-; X64-AVX-NEXT:    vptest %xmm0, %xmm0
-; X64-AVX-NEXT:    setne %al
-; X64-AVX-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 16) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length16_eq_const(ptr %X) nounwind !prof !14 {
-; X64-SSE2-LABEL: length16_eq_const:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT:    sete %al
-; X64-SSE2-NEXT:    retq
-;
-; X64-AVX-LABEL: length16_eq_const:
-; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-AVX-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT:    vptest %xmm0, %xmm0
-; X64-AVX-NEXT:    sete %al
-; X64-AVX-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 16) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-; PR33914 - https://bugs.llvm.org/show_bug.cgi?id=33914
-
-define i32 @length24(ptr %X, ptr %Y) nounwind !prof !14 {
-; X64-LABEL: length24:
-; X64:       # %bb.0:
-; X64-NEXT:    movl $24, %edx
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 24) nounwind
-  ret i32 %m
-}
-
-define i1 @length24_eq(ptr %x, ptr %y) nounwind !prof !14 {
-; X64-SSE2-LABEL: length24_eq:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT:    movdqu (%rsi), %xmm1
-; X64-SSE2-NEXT:    pcmpeqb %xmm0, %xmm1
-; X64-SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X64-SSE2-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
-; X64-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
-; X64-SSE2-NEXT:    pand %xmm1, %xmm2
-; X64-SSE2-NEXT:    pmovmskb %xmm2, %eax
-; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT:    sete %al
-; X64-SSE2-NEXT:    retq
-;
-; X64-AVX-LABEL: length24_eq:
-; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
-; X64-AVX-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
-; X64-AVX-NEXT:    vpxor %xmm2, %xmm1, %xmm1
-; X64-AVX-NEXT:    vpxor (%rsi), %xmm0, %xmm0
-; X64-AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
-; X64-AVX-NEXT:    vptest %xmm0, %xmm0
-; X64-AVX-NEXT:    sete %al
-; X64-AVX-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 24) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length24_eq_const(ptr %X) nounwind !prof !14 {
-; X64-SSE2-LABEL: length24_eq_const:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
-; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE2-NEXT:    pand %xmm1, %xmm0
-; X64-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT:    setne %al
-; X64-SSE2-NEXT:    retq
-;
-; X64-AVX-LABEL: length24_eq_const:
-; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
-; X64-AVX-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; X64-AVX-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT:    vptest %xmm0, %xmm0
-; X64-AVX-NEXT:    setne %al
-; X64-AVX-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 24) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length32(ptr %X, ptr %Y) nounwind !prof !14 {
-; X64-LABEL: length32:
-; X64:       # %bb.0:
-; X64-NEXT:    movl $32, %edx
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 32) nounwind
-  ret i32 %m
-}
-
-; PR33325 - https://bugs.llvm.org/show_bug.cgi?id=33325
-
-define i1 @length32_eq(ptr %x, ptr %y) nounwind !prof !14 {
-; X64-SSE2-LABEL: length32_eq:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-SSE2-NEXT:    movdqu (%rsi), %xmm2
-; X64-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
-; X64-SSE2-NEXT:    movdqu 16(%rsi), %xmm0
-; X64-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
-; X64-SSE2-NEXT:    pand %xmm2, %xmm0
-; X64-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT:    sete %al
-; X64-SSE2-NEXT:    retq
-;
-; X64-AVX1-LABEL: length32_eq:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    vmovups (%rdi), %ymm0
-; X64-AVX1-NEXT:    vxorps (%rsi), %ymm0, %ymm0
-; X64-AVX1-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX1-NEXT:    sete %al
-; X64-AVX1-NEXT:    vzeroupper
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length32_eq:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT:    vpxor (%rsi), %ymm0, %ymm0
-; X64-AVX2-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX2-NEXT:    sete %al
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 32) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length32_eq_const(ptr %X) nounwind !prof !14 {
-; X64-SSE2-LABEL: length32_eq_const:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE2-NEXT:    pand %xmm1, %xmm0
-; X64-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT:    setne %al
-; X64-SSE2-NEXT:    retq
-;
-; X64-AVX1-LABEL: length32_eq_const:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    vmovups (%rdi), %ymm0
-; X64-AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX1-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX1-NEXT:    setne %al
-; X64-AVX1-NEXT:    vzeroupper
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length32_eq_const:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX2-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX2-NEXT:    setne %al
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 32) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length64(ptr %X, ptr %Y) nounwind !prof !14 {
-; X64-LABEL: length64:
-; X64:       # %bb.0:
-; X64-NEXT:    movl $64, %edx
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 64) nounwind
-  ret i32 %m
-}
-
-define i1 @length64_eq(ptr %x, ptr %y) nounwind !prof !14 {
-; X64-SSE2-LABEL: length64_eq:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    pushq %rax
-; X64-SSE2-NEXT:    movl $64, %edx
-; X64-SSE2-NEXT:    callq memcmp
-; X64-SSE2-NEXT:    testl %eax, %eax
-; X64-SSE2-NEXT:    setne %al
-; X64-SSE2-NEXT:    popq %rcx
-; X64-SSE2-NEXT:    retq
-;
-; X64-AVX1-LABEL: length64_eq:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    vmovups (%rdi), %ymm0
-; X64-AVX1-NEXT:    vmovups 32(%rdi), %ymm1
-; X64-AVX1-NEXT:    vxorps 32(%rsi), %ymm1, %ymm1
-; X64-AVX1-NEXT:    vxorps (%rsi), %ymm0, %ymm0
-; X64-AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
-; X64-AVX1-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX1-NEXT:    setne %al
-; X64-AVX1-NEXT:    vzeroupper
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length64_eq:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT:    vmovdqu 32(%rdi), %ymm1
-; X64-AVX2-NEXT:    vpxor 32(%rsi), %ymm1, %ymm1
-; X64-AVX2-NEXT:    vpxor (%rsi), %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX2-NEXT:    setne %al
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 64) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length64_eq_const(ptr %X) nounwind !prof !14 {
-; X64-SSE2-LABEL: length64_eq_const:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    pushq %rax
-; X64-SSE2-NEXT:    movl $.L.str, %esi
-; X64-SSE2-NEXT:    movl $64, %edx
-; X64-SSE2-NEXT:    callq memcmp
-; X64-SSE2-NEXT:    testl %eax, %eax
-; X64-SSE2-NEXT:    sete %al
-; X64-SSE2-NEXT:    popq %rcx
-; X64-SSE2-NEXT:    retq
-;
-; X64-AVX1-LABEL: length64_eq_const:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    vmovups (%rdi), %ymm0
-; X64-AVX1-NEXT:    vmovups 32(%rdi), %ymm1
-; X64-AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; X64-AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
-; X64-AVX1-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX1-NEXT:    sete %al
-; X64-AVX1-NEXT:    vzeroupper
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length64_eq_const:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT:    vmovdqu 32(%rdi), %ymm1
-; X64-AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; X64-AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX2-NEXT:    sete %al
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 64) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @bcmp_length2(ptr %X, ptr %Y) nounwind !prof !14 {
-; X64-LABEL: bcmp_length2:
-; X64:       # %bb.0:
-; X64-NEXT:    movzwl (%rdi), %ecx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpw (%rsi), %cx
-; X64-NEXT:    setne %al
-; X64-NEXT:    retq
-  %m = tail call i32 @bcmp(ptr %X, ptr %Y, i64 2) nounwind
-  ret i32 %m
-}
-
-!llvm.module.flags = !{!0}
-!0 = !{i32 1, !"ProfileSummary", !1}
-!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
-!2 = !{!"ProfileFormat", !"InstrProf"}
-!3 = !{!"TotalCount", i64 10000}
-!4 = !{!"MaxCount", i64 10}
-!5 = !{!"MaxInternalCount", i64 1}
-!6 = !{!"MaxFunctionCount", i64 1000}
-!7 = !{!"NumCounts", i64 3}
-!8 = !{!"NumFunctions", i64 3}
-!9 = !{!"DetailedSummary", !10}
-!10 = !{!11, !12, !13}
-!11 = !{i32 10000, i64 100, i32 1}
-!12 = !{i32 999000, i64 100, i32 1}
-!13 = !{i32 999999, i64 1, i32 2}
-!14 = !{!"function_entry_count", i64 0}
diff --git a/llvm/test/CodeGen/X86/memcmp-x32.ll b/llvm/test/CodeGen/X86/memcmp-x32.ll
deleted file mode 100644
index a63402cea20962..00000000000000
--- a/llvm/test/CodeGen/X86/memcmp-x32.ll
+++ /dev/null
@@ -1,2429 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=cmov     | FileCheck %s --check-prefixes=X86,X86-NOSSE
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse     | FileCheck %s --check-prefixes=X86,X86-SSE1
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2    | FileCheck %s --check-prefixes=X86,X86-SSE2
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1  | FileCheck %s --check-prefixes=X86,X86-SSE41
-
-; This tests codegen time inlining/optimization of memcmp
-; rdar://6480398
-
- at .str = private constant [513 x i8] c"01234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901\00", align 1
-
-declare dso_local i32 @memcmp(ptr, ptr, i32)
-
-define i32 @length0(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length0:
-; X86:       # %bb.0:
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    retl
-   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 0) nounwind
-   ret i32 %m
- }
-
-define i1 @length0_eq(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length0_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    movb $1, %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 0) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length0_lt(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length0_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 0) nounwind
-  %c = icmp slt i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length2(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length2:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzwl (%ecx), %ecx
-; X86-NEXT:    movzwl (%eax), %edx
-; X86-NEXT:    rolw $8, %cx
-; X86-NEXT:    rolw $8, %dx
-; X86-NEXT:    movzwl %cx, %eax
-; X86-NEXT:    movzwl %dx, %ecx
-; X86-NEXT:    subl %ecx, %eax
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind
-  ret i32 %m
-}
-
-define i32 @length2_const(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length2_const:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movzwl (%eax), %eax
-; X86-NEXT:    rolw $8, %ax
-; X86-NEXT:    movzwl %ax, %eax
-; X86-NEXT:    addl $-12594, %eax # imm = 0xCECE
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i32 2) nounwind
-  ret i32 %m
-}
-
-define i1 @length2_gt_const(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length2_gt_const:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movzwl (%eax), %eax
-; X86-NEXT:    rolw $8, %ax
-; X86-NEXT:    movzwl %ax, %eax
-; X86-NEXT:    addl $-12594, %eax # imm = 0xCECE
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setg %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i32 2) nounwind
-  %c = icmp sgt i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length2_eq(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length2_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzwl (%ecx), %ecx
-; X86-NEXT:    cmpw (%eax), %cx
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length2_lt(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length2_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzwl (%ecx), %ecx
-; X86-NEXT:    movzwl (%eax), %edx
-; X86-NEXT:    rolw $8, %cx
-; X86-NEXT:    rolw $8, %dx
-; X86-NEXT:    movzwl %cx, %eax
-; X86-NEXT:    movzwl %dx, %ecx
-; X86-NEXT:    subl %ecx, %eax
-; X86-NEXT:    shrl $31, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind
-  %c = icmp slt i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length2_gt(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length2_gt:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzwl (%ecx), %ecx
-; X86-NEXT:    movzwl (%eax), %eax
-; X86-NEXT:    rolw $8, %cx
-; X86-NEXT:    rolw $8, %ax
-; X86-NEXT:    movzwl %cx, %ecx
-; X86-NEXT:    movzwl %ax, %eax
-; X86-NEXT:    subl %eax, %ecx
-; X86-NEXT:    testl %ecx, %ecx
-; X86-NEXT:    setg %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind
-  %c = icmp sgt i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length2_eq_const(ptr %X) nounwind {
-; X86-LABEL: length2_eq_const:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movzwl (%eax), %eax
-; X86-NEXT:    cmpl $12849, %eax # imm = 0x3231
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i32 2) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length2_eq_nobuiltin_attr(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length2_eq_nobuiltin_attr:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $2
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind nobuiltin
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length3(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length3:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movzwl (%eax), %edx
-; X86-NEXT:    movzwl (%ecx), %esi
-; X86-NEXT:    rolw $8, %dx
-; X86-NEXT:    rolw $8, %si
-; X86-NEXT:    cmpw %si, %dx
-; X86-NEXT:    jne .LBB11_3
-; X86-NEXT:  # %bb.1: # %loadbb1
-; X86-NEXT:    movzbl 2(%eax), %eax
-; X86-NEXT:    movzbl 2(%ecx), %ecx
-; X86-NEXT:    subl %ecx, %eax
-; X86-NEXT:    popl %esi
-; X86-NEXT:    retl
-; X86-NEXT:  .LBB11_3: # %res_block
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpw %si, %dx
-; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    orl $1, %eax
-; X86-NEXT:    popl %esi
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 3) nounwind
-  ret i32 %m
-}
-
-define i1 @length3_eq(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length3_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movzwl (%ecx), %edx
-; X86-NEXT:    xorw (%eax), %dx
-; X86-NEXT:    movzbl 2(%ecx), %ecx
-; X86-NEXT:    xorb 2(%eax), %cl
-; X86-NEXT:    movzbl %cl, %eax
-; X86-NEXT:    orw %dx, %ax
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 3) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length4(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length4:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %ecx
-; X86-NEXT:    movl (%eax), %edx
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    seta %al
-; X86-NEXT:    sbbl $0, %eax
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 4) nounwind
-  ret i32 %m
-}
-
-define i1 @length4_eq(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length4_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %ecx
-; X86-NEXT:    cmpl (%eax), %ecx
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 4) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length4_lt(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length4_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %ecx
-; X86-NEXT:    movl (%eax), %eax
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %eax
-; X86-NEXT:    cmpl %eax, %ecx
-; X86-NEXT:    setb %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 4) nounwind
-  %c = icmp slt i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length4_gt(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length4_gt:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %ecx
-; X86-NEXT:    movl (%eax), %eax
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %eax
-; X86-NEXT:    cmpl %eax, %ecx
-; X86-NEXT:    seta %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 4) nounwind
-  %c = icmp sgt i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length4_eq_const(ptr %X) nounwind {
-; X86-LABEL: length4_eq_const:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl $875770417, (%eax) # imm = 0x34333231
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i32 4) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length5(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length5:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl (%eax), %edx
-; X86-NEXT:    movl (%ecx), %esi
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    bswapl %esi
-; X86-NEXT:    cmpl %esi, %edx
-; X86-NEXT:    jne .LBB18_3
-; X86-NEXT:  # %bb.1: # %loadbb1
-; X86-NEXT:    movzbl 4(%eax), %eax
-; X86-NEXT:    movzbl 4(%ecx), %ecx
-; X86-NEXT:    subl %ecx, %eax
-; X86-NEXT:    popl %esi
-; X86-NEXT:    retl
-; X86-NEXT:  .LBB18_3: # %res_block
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %esi, %edx
-; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    orl $1, %eax
-; X86-NEXT:    popl %esi
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 5) nounwind
-  ret i32 %m
-}
-
-define i1 @length5_eq(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length5_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %edx
-; X86-NEXT:    xorl (%eax), %edx
-; X86-NEXT:    movzbl 4(%ecx), %ecx
-; X86-NEXT:    xorb 4(%eax), %cl
-; X86-NEXT:    movzbl %cl, %eax
-; X86-NEXT:    orl %edx, %eax
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 5) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length5_lt(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length5_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl (%eax), %edx
-; X86-NEXT:    movl (%ecx), %esi
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    bswapl %esi
-; X86-NEXT:    cmpl %esi, %edx
-; X86-NEXT:    jne .LBB20_3
-; X86-NEXT:  # %bb.1: # %loadbb1
-; X86-NEXT:    movzbl 4(%eax), %eax
-; X86-NEXT:    movzbl 4(%ecx), %ecx
-; X86-NEXT:    subl %ecx, %eax
-; X86-NEXT:    jmp .LBB20_2
-; X86-NEXT:  .LBB20_3: # %res_block
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %esi, %edx
-; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    orl $1, %eax
-; X86-NEXT:  .LBB20_2: # %endblock
-; X86-NEXT:    shrl $31, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    popl %esi
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 5) nounwind
-  %c = icmp slt i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length7(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length7:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl (%esi), %ecx
-; X86-NEXT:    movl (%eax), %edx
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    jne .LBB21_2
-; X86-NEXT:  # %bb.1: # %loadbb1
-; X86-NEXT:    movl 3(%esi), %ecx
-; X86-NEXT:    movl 3(%eax), %edx
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    je .LBB21_3
-; X86-NEXT:  .LBB21_2: # %res_block
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    orl $1, %eax
-; X86-NEXT:  .LBB21_3: # %endblock
-; X86-NEXT:    popl %esi
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 7) nounwind
-  ret i32 %m
-}
-
-define i1 @length7_lt(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length7_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl (%esi), %ecx
-; X86-NEXT:    movl (%eax), %edx
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    jne .LBB22_2
-; X86-NEXT:  # %bb.1: # %loadbb1
-; X86-NEXT:    movl 3(%esi), %ecx
-; X86-NEXT:    movl 3(%eax), %edx
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    je .LBB22_3
-; X86-NEXT:  .LBB22_2: # %res_block
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    orl $1, %eax
-; X86-NEXT:  .LBB22_3: # %endblock
-; X86-NEXT:    shrl $31, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    popl %esi
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 7) nounwind
-  %c = icmp slt i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length7_eq(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length7_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %edx
-; X86-NEXT:    movl 3(%ecx), %ecx
-; X86-NEXT:    xorl (%eax), %edx
-; X86-NEXT:    xorl 3(%eax), %ecx
-; X86-NEXT:    orl %edx, %ecx
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 7) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length8(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length8:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl (%esi), %ecx
-; X86-NEXT:    movl (%eax), %edx
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    jne .LBB24_2
-; X86-NEXT:  # %bb.1: # %loadbb1
-; X86-NEXT:    movl 4(%esi), %ecx
-; X86-NEXT:    movl 4(%eax), %edx
-; X86-NEXT:    bswapl %ecx
-; X86-NEXT:    bswapl %edx
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    je .LBB24_3
-; X86-NEXT:  .LBB24_2: # %res_block
-; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    cmpl %edx, %ecx
-; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    orl $1, %eax
-; X86-NEXT:  .LBB24_3: # %endblock
-; X86-NEXT:    popl %esi
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 8) nounwind
-  ret i32 %m
-}
-
-define i1 @length8_eq(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length8_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl (%ecx), %edx
-; X86-NEXT:    movl 4(%ecx), %ecx
-; X86-NEXT:    xorl (%eax), %edx
-; X86-NEXT:    xorl 4(%eax), %ecx
-; X86-NEXT:    orl %edx, %ecx
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 8) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length8_eq_const(ptr %X) nounwind {
-; X86-LABEL: length8_eq_const:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl $858927408, %ecx # imm = 0x33323130
-; X86-NEXT:    xorl (%eax), %ecx
-; X86-NEXT:    movl $926299444, %edx # imm = 0x37363534
-; X86-NEXT:    xorl 4(%eax), %edx
-; X86-NEXT:    orl %ecx, %edx
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 8) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length9_eq(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length9_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $9
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 9) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length10_eq(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length10_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $10
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 10) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length11_eq(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length11_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $11
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 11) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length12_eq(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length12_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $12
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 12) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length12(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length12:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $12
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 12) nounwind
-  ret i32 %m
-}
-
-define i1 @length13_eq(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length13_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $13
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 13) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length14_eq(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length14_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $14
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 14) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length15(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length15:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $15
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 15) nounwind
-  ret i32 %m
-}
-
-define i1 @length15_lt(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length15_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $15
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    shrl $31, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 15) nounwind
-  %c = icmp slt i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length15_const(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length15_const:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $15
-; X86-NEXT:    pushl $.L.str+1
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i32 15) nounwind
-  ret i32 %m
-}
-
-define i1 @length15_eq(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length15_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $15
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 15) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length15_gt_const(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length15_gt_const:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $15
-; X86-NEXT:    pushl $.L.str+1
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setg %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i32 15) nounwind
-  %c = icmp sgt i32 %m, 0
-  ret i1 %c
-}
-
-; PR33329 - https://bugs.llvm.org/show_bug.cgi?id=33329
-
-define i32 @length16(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length16:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $16
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 16) nounwind
-  ret i32 %m
-}
-
-define i1 @length16_eq(ptr %x, ptr %y) nounwind {
-; X86-NOSSE-LABEL: length16_eq:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl $16
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    calll memcmp
-; X86-NOSSE-NEXT:    addl $12, %esp
-; X86-NOSSE-NEXT:    testl %eax, %eax
-; X86-NOSSE-NEXT:    setne %al
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE1-LABEL: length16_eq:
-; X86-SSE1:       # %bb.0:
-; X86-SSE1-NEXT:    pushl $16
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    calll memcmp
-; X86-SSE1-NEXT:    addl $12, %esp
-; X86-SSE1-NEXT:    testl %eax, %eax
-; X86-SSE1-NEXT:    setne %al
-; X86-SSE1-NEXT:    retl
-;
-; X86-SSE2-LABEL: length16_eq:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm1
-; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm1
-; X86-SSE2-NEXT:    pmovmskb %xmm1, %eax
-; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X86-SSE2-NEXT:    setne %al
-; X86-SSE2-NEXT:    retl
-;
-; X86-SSE41-LABEL: length16_eq:
-; X86-SSE41:       # %bb.0:
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE41-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE41-NEXT:    movdqu (%eax), %xmm1
-; X86-SSE41-NEXT:    pxor %xmm0, %xmm1
-; X86-SSE41-NEXT:    ptest %xmm1, %xmm1
-; X86-SSE41-NEXT:    setne %al
-; X86-SSE41-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 16) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length16_lt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length16_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $16
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    shrl $31, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 16) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length16_gt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length16_gt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $16
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setg %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 16) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length16_eq_const(ptr %X) nounwind {
-; X86-NOSSE-LABEL: length16_eq_const:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl $16
-; X86-NOSSE-NEXT:    pushl $.L.str
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    calll memcmp
-; X86-NOSSE-NEXT:    addl $12, %esp
-; X86-NOSSE-NEXT:    testl %eax, %eax
-; X86-NOSSE-NEXT:    sete %al
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE1-LABEL: length16_eq_const:
-; X86-SSE1:       # %bb.0:
-; X86-SSE1-NEXT:    pushl $16
-; X86-SSE1-NEXT:    pushl $.L.str
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    calll memcmp
-; X86-SSE1-NEXT:    addl $12, %esp
-; X86-SSE1-NEXT:    testl %eax, %eax
-; X86-SSE1-NEXT:    sete %al
-; X86-SSE1-NEXT:    retl
-;
-; X86-SSE2-LABEL: length16_eq_const:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm0
-; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X86-SSE2-NEXT:    sete %al
-; X86-SSE2-NEXT:    retl
-;
-; X86-SSE41-LABEL: length16_eq_const:
-; X86-SSE41:       # %bb.0:
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT:    movdqu (%eax), %xmm0
-; X86-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE41-NEXT:    ptest %xmm0, %xmm0
-; X86-SSE41-NEXT:    sete %al
-; X86-SSE41-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 16) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-; PR33914 - https://bugs.llvm.org/show_bug.cgi?id=33914
-
-define i32 @length24(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length24:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $24
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 24) nounwind
-  ret i32 %m
-}
-
-define i1 @length24_eq(ptr %x, ptr %y) nounwind {
-; X86-NOSSE-LABEL: length24_eq:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl $24
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    calll memcmp
-; X86-NOSSE-NEXT:    addl $12, %esp
-; X86-NOSSE-NEXT:    testl %eax, %eax
-; X86-NOSSE-NEXT:    sete %al
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE1-LABEL: length24_eq:
-; X86-SSE1:       # %bb.0:
-; X86-SSE1-NEXT:    pushl $24
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    calll memcmp
-; X86-SSE1-NEXT:    addl $12, %esp
-; X86-SSE1-NEXT:    testl %eax, %eax
-; X86-SSE1-NEXT:    sete %al
-; X86-SSE1-NEXT:    retl
-;
-; X86-SSE2-LABEL: length24_eq:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE2-NEXT:    movdqu 8(%ecx), %xmm1
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm2
-; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
-; X86-SSE2-NEXT:    movdqu 8(%eax), %xmm0
-; X86-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
-; X86-SSE2-NEXT:    pand %xmm2, %xmm0
-; X86-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X86-SSE2-NEXT:    sete %al
-; X86-SSE2-NEXT:    retl
-;
-; X86-SSE41-LABEL: length24_eq:
-; X86-SSE41:       # %bb.0:
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE41-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE41-NEXT:    movdqu 8(%ecx), %xmm1
-; X86-SSE41-NEXT:    movdqu (%eax), %xmm2
-; X86-SSE41-NEXT:    pxor %xmm0, %xmm2
-; X86-SSE41-NEXT:    movdqu 8(%eax), %xmm0
-; X86-SSE41-NEXT:    pxor %xmm1, %xmm0
-; X86-SSE41-NEXT:    por %xmm2, %xmm0
-; X86-SSE41-NEXT:    ptest %xmm0, %xmm0
-; X86-SSE41-NEXT:    sete %al
-; X86-SSE41-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 24) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length24_lt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length24_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $24
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    shrl $31, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 24) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length24_gt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length24_gt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $24
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setg %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 24) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length24_eq_const(ptr %X) nounwind {
-; X86-NOSSE-LABEL: length24_eq_const:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl $24
-; X86-NOSSE-NEXT:    pushl $.L.str
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    calll memcmp
-; X86-NOSSE-NEXT:    addl $12, %esp
-; X86-NOSSE-NEXT:    testl %eax, %eax
-; X86-NOSSE-NEXT:    setne %al
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE1-LABEL: length24_eq_const:
-; X86-SSE1:       # %bb.0:
-; X86-SSE1-NEXT:    pushl $24
-; X86-SSE1-NEXT:    pushl $.L.str
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    calll memcmp
-; X86-SSE1-NEXT:    addl $12, %esp
-; X86-SSE1-NEXT:    testl %eax, %eax
-; X86-SSE1-NEXT:    setne %al
-; X86-SSE1-NEXT:    retl
-;
-; X86-SSE2-LABEL: length24_eq_const:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm0
-; X86-SSE2-NEXT:    movdqu 8(%eax), %xmm1
-; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE2-NEXT:    pand %xmm1, %xmm0
-; X86-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X86-SSE2-NEXT:    setne %al
-; X86-SSE2-NEXT:    retl
-;
-; X86-SSE41-LABEL: length24_eq_const:
-; X86-SSE41:       # %bb.0:
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT:    movdqu (%eax), %xmm0
-; X86-SSE41-NEXT:    movdqu 8(%eax), %xmm1
-; X86-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE41-NEXT:    por %xmm1, %xmm0
-; X86-SSE41-NEXT:    ptest %xmm0, %xmm0
-; X86-SSE41-NEXT:    setne %al
-; X86-SSE41-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 24) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length31(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length31:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $31
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 31) nounwind
-  ret i32 %m
-}
-
-define i1 @length31_eq(ptr %x, ptr %y) nounwind {
-; X86-NOSSE-LABEL: length31_eq:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl $31
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    calll memcmp
-; X86-NOSSE-NEXT:    addl $12, %esp
-; X86-NOSSE-NEXT:    testl %eax, %eax
-; X86-NOSSE-NEXT:    sete %al
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE1-LABEL: length31_eq:
-; X86-SSE1:       # %bb.0:
-; X86-SSE1-NEXT:    pushl $31
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    calll memcmp
-; X86-SSE1-NEXT:    addl $12, %esp
-; X86-SSE1-NEXT:    testl %eax, %eax
-; X86-SSE1-NEXT:    sete %al
-; X86-SSE1-NEXT:    retl
-;
-; X86-SSE2-LABEL: length31_eq:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE2-NEXT:    movdqu 15(%ecx), %xmm1
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm2
-; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
-; X86-SSE2-NEXT:    movdqu 15(%eax), %xmm0
-; X86-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
-; X86-SSE2-NEXT:    pand %xmm2, %xmm0
-; X86-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X86-SSE2-NEXT:    sete %al
-; X86-SSE2-NEXT:    retl
-;
-; X86-SSE41-LABEL: length31_eq:
-; X86-SSE41:       # %bb.0:
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE41-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE41-NEXT:    movdqu 15(%ecx), %xmm1
-; X86-SSE41-NEXT:    movdqu (%eax), %xmm2
-; X86-SSE41-NEXT:    pxor %xmm0, %xmm2
-; X86-SSE41-NEXT:    movdqu 15(%eax), %xmm0
-; X86-SSE41-NEXT:    pxor %xmm1, %xmm0
-; X86-SSE41-NEXT:    por %xmm2, %xmm0
-; X86-SSE41-NEXT:    ptest %xmm0, %xmm0
-; X86-SSE41-NEXT:    sete %al
-; X86-SSE41-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 31) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length31_lt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length31_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $31
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    shrl $31, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 31) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length31_gt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length31_gt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $31
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setg %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 31) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length31_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"="128" {
-; X86-NOSSE-LABEL: length31_eq_prefer128:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl $31
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    calll memcmp
-; X86-NOSSE-NEXT:    addl $12, %esp
-; X86-NOSSE-NEXT:    testl %eax, %eax
-; X86-NOSSE-NEXT:    sete %al
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE1-LABEL: length31_eq_prefer128:
-; X86-SSE1:       # %bb.0:
-; X86-SSE1-NEXT:    pushl $31
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    calll memcmp
-; X86-SSE1-NEXT:    addl $12, %esp
-; X86-SSE1-NEXT:    testl %eax, %eax
-; X86-SSE1-NEXT:    sete %al
-; X86-SSE1-NEXT:    retl
-;
-; X86-SSE2-LABEL: length31_eq_prefer128:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE2-NEXT:    movdqu 15(%ecx), %xmm1
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm2
-; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
-; X86-SSE2-NEXT:    movdqu 15(%eax), %xmm0
-; X86-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
-; X86-SSE2-NEXT:    pand %xmm2, %xmm0
-; X86-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X86-SSE2-NEXT:    sete %al
-; X86-SSE2-NEXT:    retl
-;
-; X86-SSE41-LABEL: length31_eq_prefer128:
-; X86-SSE41:       # %bb.0:
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE41-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE41-NEXT:    movdqu 15(%ecx), %xmm1
-; X86-SSE41-NEXT:    movdqu (%eax), %xmm2
-; X86-SSE41-NEXT:    pxor %xmm0, %xmm2
-; X86-SSE41-NEXT:    movdqu 15(%eax), %xmm0
-; X86-SSE41-NEXT:    pxor %xmm1, %xmm0
-; X86-SSE41-NEXT:    por %xmm2, %xmm0
-; X86-SSE41-NEXT:    ptest %xmm0, %xmm0
-; X86-SSE41-NEXT:    sete %al
-; X86-SSE41-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 31) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length31_eq_const(ptr %X) nounwind {
-; X86-NOSSE-LABEL: length31_eq_const:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl $31
-; X86-NOSSE-NEXT:    pushl $.L.str
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    calll memcmp
-; X86-NOSSE-NEXT:    addl $12, %esp
-; X86-NOSSE-NEXT:    testl %eax, %eax
-; X86-NOSSE-NEXT:    setne %al
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE1-LABEL: length31_eq_const:
-; X86-SSE1:       # %bb.0:
-; X86-SSE1-NEXT:    pushl $31
-; X86-SSE1-NEXT:    pushl $.L.str
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    calll memcmp
-; X86-SSE1-NEXT:    addl $12, %esp
-; X86-SSE1-NEXT:    testl %eax, %eax
-; X86-SSE1-NEXT:    setne %al
-; X86-SSE1-NEXT:    retl
-;
-; X86-SSE2-LABEL: length31_eq_const:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm0
-; X86-SSE2-NEXT:    movdqu 15(%eax), %xmm1
-; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE2-NEXT:    pand %xmm1, %xmm0
-; X86-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X86-SSE2-NEXT:    setne %al
-; X86-SSE2-NEXT:    retl
-;
-; X86-SSE41-LABEL: length31_eq_const:
-; X86-SSE41:       # %bb.0:
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT:    movdqu (%eax), %xmm0
-; X86-SSE41-NEXT:    movdqu 15(%eax), %xmm1
-; X86-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE41-NEXT:    por %xmm1, %xmm0
-; X86-SSE41-NEXT:    ptest %xmm0, %xmm0
-; X86-SSE41-NEXT:    setne %al
-; X86-SSE41-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 31) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length32(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length32:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $32
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 32) nounwind
-  ret i32 %m
-}
-
-; PR33325 - https://bugs.llvm.org/show_bug.cgi?id=33325
-
-define i1 @length32_eq(ptr %x, ptr %y) nounwind {
-; X86-NOSSE-LABEL: length32_eq:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl $32
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    calll memcmp
-; X86-NOSSE-NEXT:    addl $12, %esp
-; X86-NOSSE-NEXT:    testl %eax, %eax
-; X86-NOSSE-NEXT:    sete %al
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE1-LABEL: length32_eq:
-; X86-SSE1:       # %bb.0:
-; X86-SSE1-NEXT:    pushl $32
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    calll memcmp
-; X86-SSE1-NEXT:    addl $12, %esp
-; X86-SSE1-NEXT:    testl %eax, %eax
-; X86-SSE1-NEXT:    sete %al
-; X86-SSE1-NEXT:    retl
-;
-; X86-SSE2-LABEL: length32_eq:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE2-NEXT:    movdqu 16(%ecx), %xmm1
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm2
-; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
-; X86-SSE2-NEXT:    movdqu 16(%eax), %xmm0
-; X86-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
-; X86-SSE2-NEXT:    pand %xmm2, %xmm0
-; X86-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X86-SSE2-NEXT:    sete %al
-; X86-SSE2-NEXT:    retl
-;
-; X86-SSE41-LABEL: length32_eq:
-; X86-SSE41:       # %bb.0:
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE41-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE41-NEXT:    movdqu 16(%ecx), %xmm1
-; X86-SSE41-NEXT:    movdqu (%eax), %xmm2
-; X86-SSE41-NEXT:    pxor %xmm0, %xmm2
-; X86-SSE41-NEXT:    movdqu 16(%eax), %xmm0
-; X86-SSE41-NEXT:    pxor %xmm1, %xmm0
-; X86-SSE41-NEXT:    por %xmm2, %xmm0
-; X86-SSE41-NEXT:    ptest %xmm0, %xmm0
-; X86-SSE41-NEXT:    sete %al
-; X86-SSE41-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 32) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length32_lt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length32_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $32
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    shrl $31, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 32) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length32_gt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length32_gt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $32
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setg %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 32) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length32_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"="128" {
-; X86-NOSSE-LABEL: length32_eq_prefer128:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl $32
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    calll memcmp
-; X86-NOSSE-NEXT:    addl $12, %esp
-; X86-NOSSE-NEXT:    testl %eax, %eax
-; X86-NOSSE-NEXT:    sete %al
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE1-LABEL: length32_eq_prefer128:
-; X86-SSE1:       # %bb.0:
-; X86-SSE1-NEXT:    pushl $32
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    calll memcmp
-; X86-SSE1-NEXT:    addl $12, %esp
-; X86-SSE1-NEXT:    testl %eax, %eax
-; X86-SSE1-NEXT:    sete %al
-; X86-SSE1-NEXT:    retl
-;
-; X86-SSE2-LABEL: length32_eq_prefer128:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE2-NEXT:    movdqu 16(%ecx), %xmm1
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm2
-; X86-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
-; X86-SSE2-NEXT:    movdqu 16(%eax), %xmm0
-; X86-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
-; X86-SSE2-NEXT:    pand %xmm2, %xmm0
-; X86-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X86-SSE2-NEXT:    sete %al
-; X86-SSE2-NEXT:    retl
-;
-; X86-SSE41-LABEL: length32_eq_prefer128:
-; X86-SSE41:       # %bb.0:
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE41-NEXT:    movdqu (%ecx), %xmm0
-; X86-SSE41-NEXT:    movdqu 16(%ecx), %xmm1
-; X86-SSE41-NEXT:    movdqu (%eax), %xmm2
-; X86-SSE41-NEXT:    pxor %xmm0, %xmm2
-; X86-SSE41-NEXT:    movdqu 16(%eax), %xmm0
-; X86-SSE41-NEXT:    pxor %xmm1, %xmm0
-; X86-SSE41-NEXT:    por %xmm2, %xmm0
-; X86-SSE41-NEXT:    ptest %xmm0, %xmm0
-; X86-SSE41-NEXT:    sete %al
-; X86-SSE41-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 32) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length32_eq_const(ptr %X) nounwind {
-; X86-NOSSE-LABEL: length32_eq_const:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl $32
-; X86-NOSSE-NEXT:    pushl $.L.str
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    calll memcmp
-; X86-NOSSE-NEXT:    addl $12, %esp
-; X86-NOSSE-NEXT:    testl %eax, %eax
-; X86-NOSSE-NEXT:    setne %al
-; X86-NOSSE-NEXT:    retl
-;
-; X86-SSE1-LABEL: length32_eq_const:
-; X86-SSE1:       # %bb.0:
-; X86-SSE1-NEXT:    pushl $32
-; X86-SSE1-NEXT:    pushl $.L.str
-; X86-SSE1-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    calll memcmp
-; X86-SSE1-NEXT:    addl $12, %esp
-; X86-SSE1-NEXT:    testl %eax, %eax
-; X86-SSE1-NEXT:    setne %al
-; X86-SSE1-NEXT:    retl
-;
-; X86-SSE2-LABEL: length32_eq_const:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movdqu (%eax), %xmm0
-; X86-SSE2-NEXT:    movdqu 16(%eax), %xmm1
-; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE2-NEXT:    pand %xmm1, %xmm0
-; X86-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X86-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X86-SSE2-NEXT:    setne %al
-; X86-SSE2-NEXT:    retl
-;
-; X86-SSE41-LABEL: length32_eq_const:
-; X86-SSE41:       # %bb.0:
-; X86-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT:    movdqu (%eax), %xmm0
-; X86-SSE41-NEXT:    movdqu 16(%eax), %xmm1
-; X86-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE41-NEXT:    por %xmm1, %xmm0
-; X86-SSE41-NEXT:    ptest %xmm0, %xmm0
-; X86-SSE41-NEXT:    setne %al
-; X86-SSE41-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 32) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length48(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length48:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $48
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 48) nounwind
-  ret i32 %m
-}
-
-define i1 @length48_eq(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length48_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $48
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 48) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length48_lt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length48_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $48
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    shrl $31, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 48) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length48_gt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length48_gt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $48
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setg %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 48) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length48_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"="128" {
-; X86-LABEL: length48_eq_prefer128:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $48
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 48) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length48_eq_const(ptr %X) nounwind {
-; X86-LABEL: length48_eq_const:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $48
-; X86-NEXT:    pushl $.L.str
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 48) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length63(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length63:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $63
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 63) nounwind
-  ret i32 %m
-}
-
-define i1 @length63_eq(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length63_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $63
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 63) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length63_lt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length63_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $63
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    shrl $31, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 63) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length63_gt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length63_gt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $63
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setg %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 63) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length63_eq_const(ptr %X) nounwind {
-; X86-LABEL: length63_eq_const:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $63
-; X86-NEXT:    pushl $.L.str
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 63) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length64(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length64:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $64
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 64) nounwind
-  ret i32 %m
-}
-
-define i1 @length64_eq(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length64_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $64
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 64) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length64_lt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length64_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $64
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    shrl $31, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 64) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length64_gt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length64_gt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $64
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setg %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 64) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length64_eq_const(ptr %X) nounwind {
-; X86-LABEL: length64_eq_const:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $64
-; X86-NEXT:    pushl $.L.str
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 64) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length96(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length96:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $96
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 96) nounwind
-  ret i32 %m
-}
-
-define i1 @length96_eq(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length96_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $96
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 96) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length96_lt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length96_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $96
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    shrl $31, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 96) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length96_gt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length96_gt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $96
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setg %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 96) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length96_eq_const(ptr %X) nounwind {
-; X86-LABEL: length96_eq_const:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $96
-; X86-NEXT:    pushl $.L.str
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 96) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length127(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length127:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $127
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 127) nounwind
-  ret i32 %m
-}
-
-define i1 @length127_eq(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length127_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $127
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 127) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length127_lt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length127_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $127
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    shrl $31, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 127) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length127_gt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length127_gt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $127
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setg %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 127) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length127_eq_const(ptr %X) nounwind {
-; X86-LABEL: length127_eq_const:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $127
-; X86-NEXT:    pushl $.L.str
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 127) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length128(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length128:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $128
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 128) nounwind
-  ret i32 %m
-}
-
-define i1 @length128_eq(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length128_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $128
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 128) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length128_lt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length128_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $128
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    shrl $31, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 128) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length128_gt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length128_gt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $128
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setg %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 128) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length128_eq_const(ptr %X) nounwind {
-; X86-LABEL: length128_eq_const:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $128
-; X86-NEXT:    pushl $.L.str
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 128) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length192(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length192:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $192
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 192) nounwind
-  ret i32 %m
-}
-
-define i1 @length192_eq(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length192_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $192
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 192) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length192_lt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length192_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $192
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    shrl $31, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 192) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length192_gt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length192_gt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $192
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setg %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 192) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length192_eq_const(ptr %X) nounwind {
-; X86-LABEL: length192_eq_const:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $192
-; X86-NEXT:    pushl $.L.str
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 192) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length255(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length255:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $255
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 255) nounwind
-  ret i32 %m
-}
-
-define i1 @length255_eq(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length255_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $255
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 255) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length255_lt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length255_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $255
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    shrl $31, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 255) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length255_gt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length255_gt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $255
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setg %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 255) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length255_eq_const(ptr %X) nounwind {
-; X86-LABEL: length255_eq_const:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $255
-; X86-NEXT:    pushl $.L.str
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 255) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length256(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length256:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $256 # imm = 0x100
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 256) nounwind
-  ret i32 %m
-}
-
-define i1 @length256_eq(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length256_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $256 # imm = 0x100
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 256) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length256_lt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length256_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $256 # imm = 0x100
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    shrl $31, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 256) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length256_gt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length256_gt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $256 # imm = 0x100
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setg %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 256) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length256_eq_const(ptr %X) nounwind {
-; X86-LABEL: length256_eq_const:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $256 # imm = 0x100
-; X86-NEXT:    pushl $.L.str
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 256) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length384(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length384:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $384 # imm = 0x180
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 384) nounwind
-  ret i32 %m
-}
-
-define i1 @length384_eq(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length384_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $384 # imm = 0x180
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 384) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length384_lt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length384_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $384 # imm = 0x180
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    shrl $31, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 384) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length384_gt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length384_gt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $384 # imm = 0x180
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setg %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 384) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length384_eq_const(ptr %X) nounwind {
-; X86-LABEL: length384_eq_const:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $384 # imm = 0x180
-; X86-NEXT:    pushl $.L.str
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 384) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length511(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length511:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $511 # imm = 0x1FF
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 511) nounwind
-  ret i32 %m
-}
-
-define i1 @length511_eq(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length511_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $511 # imm = 0x1FF
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 511) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length511_lt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length511_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $511 # imm = 0x1FF
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    shrl $31, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 511) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length511_gt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length511_gt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $511 # imm = 0x1FF
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setg %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 511) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length511_eq_const(ptr %X) nounwind {
-; X86-LABEL: length511_eq_const:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $511 # imm = 0x1FF
-; X86-NEXT:    pushl $.L.str
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 511) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length512(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: length512:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $512 # imm = 0x200
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 512) nounwind
-  ret i32 %m
-}
-
-define i1 @length512_eq(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length512_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $512 # imm = 0x200
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setne %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 512) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length512_lt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length512_lt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $512 # imm = 0x200
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    shrl $31, %eax
-; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 512) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length512_gt(ptr %x, ptr %y) nounwind {
-; X86-LABEL: length512_gt:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $512 # imm = 0x200
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    setg %al
-; X86-NEXT:    retl
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 512) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length512_eq_const(ptr %X) nounwind {
-; X86-LABEL: length512_eq_const:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $512 # imm = 0x200
-; X86-NEXT:    pushl $.L.str
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 512) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-; This checks that we do not do stupid things with huge sizes.
-define i32 @huge_length(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: huge_length:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $-1
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 9223372036854775807) nounwind
-  ret i32 %m
-}
-
-define i1 @huge_length_eq(ptr %X, ptr %Y) nounwind {
-; X86-LABEL: huge_length_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl $-1
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 9223372036854775807) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-; This checks non-constant sizes.
-define i32 @nonconst_length(ptr %X, ptr %Y, i32 %size) nounwind {
-; X86-LABEL: nonconst_length:
-; X86:       # %bb.0:
-; X86-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 %size) nounwind
-  ret i32 %m
-}
-
-define i1 @nonconst_length_eq(ptr %X, ptr %Y, i32 %size) nounwind {
-; X86-LABEL: nonconst_length_eq:
-; X86:       # %bb.0:
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    calll memcmp
-; X86-NEXT:    addl $12, %esp
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    sete %al
-; X86-NEXT:    retl
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 %size) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
diff --git a/llvm/test/CodeGen/X86/memcmp.ll b/llvm/test/CodeGen/X86/memcmp.ll
deleted file mode 100644
index f5e7384362a92b..00000000000000
--- a/llvm/test/CodeGen/X86/memcmp.ll
+++ /dev/null
@@ -1,3065 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown               | FileCheck %s --check-prefixes=X64,X64-SSE,X64-SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse4.1 | FileCheck %s --check-prefixes=X64,X64-SSE,X64-SSE41
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx    | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2   | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512bw,+prefer-256-bit | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512bw,-prefer-256-bit | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX512,X64-AVX512BW
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512f,+prefer-256-bit,-prefer-mask-registers | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512f,-prefer-256-bit,-prefer-mask-registers | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX512,X64-AVX512F
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512f,+prefer-256-bit,+prefer-mask-registers | FileCheck %s --check-prefixes=X64,X64-MIC-AVX,X64-MIC-AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512f,-prefer-256-bit,+prefer-mask-registers | FileCheck %s --check-prefixes=X64,X64-MIC-AVX,X64-MIC-AVX512F
-
-; This tests codegen time inlining/optimization of memcmp
-; rdar://6480398
-
- at .str = private constant [513 x i8] c"01234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901\00", align 1
-
-declare dso_local i32 @memcmp(ptr, ptr, i64)
-
-define i32 @length0(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length0:
-; X64:       # %bb.0:
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    retq
-   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 0) nounwind
-   ret i32 %m
- }
-
-define i1 @length0_eq(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length0_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movb $1, %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 0) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length0_lt(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length0_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 0) nounwind
-  %c = icmp slt i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length2(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length2:
-; X64:       # %bb.0:
-; X64-NEXT:    movzwl (%rdi), %eax
-; X64-NEXT:    movzwl (%rsi), %ecx
-; X64-NEXT:    rolw $8, %ax
-; X64-NEXT:    rolw $8, %cx
-; X64-NEXT:    movzwl %ax, %eax
-; X64-NEXT:    movzwl %cx, %ecx
-; X64-NEXT:    subl %ecx, %eax
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
-  ret i32 %m
-}
-
-define i32 @length2_const(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length2_const:
-; X64:       # %bb.0:
-; X64-NEXT:    movzwl (%rdi), %eax
-; X64-NEXT:    rolw $8, %ax
-; X64-NEXT:    movzwl %ax, %eax
-; X64-NEXT:    addl $-12594, %eax # imm = 0xCECE
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i64 2) nounwind
-  ret i32 %m
-}
-
-define i1 @length2_gt_const(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length2_gt_const:
-; X64:       # %bb.0:
-; X64-NEXT:    movzwl (%rdi), %eax
-; X64-NEXT:    rolw $8, %ax
-; X64-NEXT:    movzwl %ax, %eax
-; X64-NEXT:    addl $-12594, %eax # imm = 0xCECE
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setg %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i64 2) nounwind
-  %c = icmp sgt i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length2_eq(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length2_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movzwl (%rdi), %eax
-; X64-NEXT:    cmpw (%rsi), %ax
-; X64-NEXT:    sete %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length2_lt(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length2_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    movzwl (%rdi), %eax
-; X64-NEXT:    movzwl (%rsi), %ecx
-; X64-NEXT:    rolw $8, %ax
-; X64-NEXT:    rolw $8, %cx
-; X64-NEXT:    movzwl %ax, %eax
-; X64-NEXT:    movzwl %cx, %ecx
-; X64-NEXT:    subl %ecx, %eax
-; X64-NEXT:    shrl $31, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
-  %c = icmp slt i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length2_gt(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length2_gt:
-; X64:       # %bb.0:
-; X64-NEXT:    movzwl (%rdi), %eax
-; X64-NEXT:    movzwl (%rsi), %ecx
-; X64-NEXT:    rolw $8, %ax
-; X64-NEXT:    rolw $8, %cx
-; X64-NEXT:    movzwl %ax, %eax
-; X64-NEXT:    movzwl %cx, %ecx
-; X64-NEXT:    subl %ecx, %eax
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setg %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
-  %c = icmp sgt i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length2_eq_const(ptr %X) nounwind {
-; X64-LABEL: length2_eq_const:
-; X64:       # %bb.0:
-; X64-NEXT:    movzwl (%rdi), %eax
-; X64-NEXT:    cmpl $12849, %eax # imm = 0x3231
-; X64-NEXT:    setne %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i64 2) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length2_eq_nobuiltin_attr(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length2_eq_nobuiltin_attr:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $2, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    sete %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind nobuiltin
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length3(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length3:
-; X64:       # %bb.0:
-; X64-NEXT:    movzwl (%rdi), %ecx
-; X64-NEXT:    movzwl (%rsi), %edx
-; X64-NEXT:    rolw $8, %cx
-; X64-NEXT:    rolw $8, %dx
-; X64-NEXT:    cmpw %dx, %cx
-; X64-NEXT:    jne .LBB11_3
-; X64-NEXT:  # %bb.1: # %loadbb1
-; X64-NEXT:    movzbl 2(%rdi), %eax
-; X64-NEXT:    movzbl 2(%rsi), %ecx
-; X64-NEXT:    subl %ecx, %eax
-; X64-NEXT:    retq
-; X64-NEXT:  .LBB11_3: # %res_block
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpw %dx, %cx
-; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    orl $1, %eax
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 3) nounwind
-  ret i32 %m
-}
-
-define i1 @length3_eq(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length3_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movzwl (%rdi), %eax
-; X64-NEXT:    xorw (%rsi), %ax
-; X64-NEXT:    movzbl 2(%rdi), %ecx
-; X64-NEXT:    xorb 2(%rsi), %cl
-; X64-NEXT:    movzbl %cl, %ecx
-; X64-NEXT:    orw %ax, %cx
-; X64-NEXT:    setne %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 3) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length4(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length4:
-; X64:       # %bb.0:
-; X64-NEXT:    movl (%rdi), %ecx
-; X64-NEXT:    movl (%rsi), %edx
-; X64-NEXT:    bswapl %ecx
-; X64-NEXT:    bswapl %edx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpl %edx, %ecx
-; X64-NEXT:    seta %al
-; X64-NEXT:    sbbl $0, %eax
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
-  ret i32 %m
-}
-
-define i1 @length4_eq(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length4_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    cmpl (%rsi), %eax
-; X64-NEXT:    setne %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length4_lt(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length4_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    movl (%rsi), %ecx
-; X64-NEXT:    bswapl %eax
-; X64-NEXT:    bswapl %ecx
-; X64-NEXT:    cmpl %ecx, %eax
-; X64-NEXT:    setb %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
-  %c = icmp slt i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length4_gt(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length4_gt:
-; X64:       # %bb.0:
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    movl (%rsi), %ecx
-; X64-NEXT:    bswapl %eax
-; X64-NEXT:    bswapl %ecx
-; X64-NEXT:    cmpl %ecx, %eax
-; X64-NEXT:    seta %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
-  %c = icmp sgt i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length4_eq_const(ptr %X) nounwind {
-; X64-LABEL: length4_eq_const:
-; X64:       # %bb.0:
-; X64-NEXT:    cmpl $875770417, (%rdi) # imm = 0x34333231
-; X64-NEXT:    sete %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i64 4) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length5(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length5:
-; X64:       # %bb.0:
-; X64-NEXT:    movl (%rdi), %ecx
-; X64-NEXT:    movl (%rsi), %edx
-; X64-NEXT:    bswapl %ecx
-; X64-NEXT:    bswapl %edx
-; X64-NEXT:    cmpl %edx, %ecx
-; X64-NEXT:    jne .LBB18_3
-; X64-NEXT:  # %bb.1: # %loadbb1
-; X64-NEXT:    movzbl 4(%rdi), %eax
-; X64-NEXT:    movzbl 4(%rsi), %ecx
-; X64-NEXT:    subl %ecx, %eax
-; X64-NEXT:    retq
-; X64-NEXT:  .LBB18_3: # %res_block
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpl %edx, %ecx
-; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    orl $1, %eax
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) nounwind
-  ret i32 %m
-}
-
-define i1 @length5_eq(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length5_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    xorl (%rsi), %eax
-; X64-NEXT:    movzbl 4(%rdi), %ecx
-; X64-NEXT:    xorb 4(%rsi), %cl
-; X64-NEXT:    movzbl %cl, %ecx
-; X64-NEXT:    orl %eax, %ecx
-; X64-NEXT:    setne %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length5_lt(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length5_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    movl (%rdi), %ecx
-; X64-NEXT:    movl (%rsi), %edx
-; X64-NEXT:    bswapl %ecx
-; X64-NEXT:    bswapl %edx
-; X64-NEXT:    cmpl %edx, %ecx
-; X64-NEXT:    jne .LBB20_3
-; X64-NEXT:  # %bb.1: # %loadbb1
-; X64-NEXT:    movzbl 4(%rdi), %eax
-; X64-NEXT:    movzbl 4(%rsi), %ecx
-; X64-NEXT:    subl %ecx, %eax
-; X64-NEXT:    shrl $31, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    retq
-; X64-NEXT:  .LBB20_3: # %res_block
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpl %edx, %ecx
-; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    orl $1, %eax
-; X64-NEXT:    shrl $31, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) nounwind
-  %c = icmp slt i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length7(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length7:
-; X64:       # %bb.0:
-; X64-NEXT:    movl (%rdi), %ecx
-; X64-NEXT:    movl (%rsi), %edx
-; X64-NEXT:    bswapl %ecx
-; X64-NEXT:    bswapl %edx
-; X64-NEXT:    cmpl %edx, %ecx
-; X64-NEXT:    jne .LBB21_2
-; X64-NEXT:  # %bb.1: # %loadbb1
-; X64-NEXT:    movl 3(%rdi), %ecx
-; X64-NEXT:    movl 3(%rsi), %edx
-; X64-NEXT:    bswapl %ecx
-; X64-NEXT:    bswapl %edx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpl %edx, %ecx
-; X64-NEXT:    je .LBB21_3
-; X64-NEXT:  .LBB21_2: # %res_block
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpl %edx, %ecx
-; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    orl $1, %eax
-; X64-NEXT:  .LBB21_3: # %endblock
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 7) nounwind
-  ret i32 %m
-}
-
-define i1 @length7_lt(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length7_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    movl (%rdi), %ecx
-; X64-NEXT:    movl (%rsi), %edx
-; X64-NEXT:    bswapl %ecx
-; X64-NEXT:    bswapl %edx
-; X64-NEXT:    cmpl %edx, %ecx
-; X64-NEXT:    jne .LBB22_2
-; X64-NEXT:  # %bb.1: # %loadbb1
-; X64-NEXT:    movl 3(%rdi), %ecx
-; X64-NEXT:    movl 3(%rsi), %edx
-; X64-NEXT:    bswapl %ecx
-; X64-NEXT:    bswapl %edx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpl %edx, %ecx
-; X64-NEXT:    je .LBB22_3
-; X64-NEXT:  .LBB22_2: # %res_block
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpl %edx, %ecx
-; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    orl $1, %eax
-; X64-NEXT:  .LBB22_3: # %endblock
-; X64-NEXT:    shrl $31, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 7) nounwind
-  %c = icmp slt i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length7_eq(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length7_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    movl 3(%rdi), %ecx
-; X64-NEXT:    xorl (%rsi), %eax
-; X64-NEXT:    xorl 3(%rsi), %ecx
-; X64-NEXT:    orl %eax, %ecx
-; X64-NEXT:    setne %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 7) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length8(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length8:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rcx
-; X64-NEXT:    movq (%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    seta %al
-; X64-NEXT:    sbbl $0, %eax
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 8) nounwind
-  ret i32 %m
-}
-
-define i1 @length8_eq(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length8_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rax
-; X64-NEXT:    cmpq (%rsi), %rax
-; X64-NEXT:    sete %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 8) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length8_eq_const(ptr %X) nounwind {
-; X64-LABEL: length8_eq_const:
-; X64:       # %bb.0:
-; X64-NEXT:    movabsq $3978425819141910832, %rax # imm = 0x3736353433323130
-; X64-NEXT:    cmpq %rax, (%rdi)
-; X64-NEXT:    setne %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 8) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length9_eq(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length9_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rax
-; X64-NEXT:    xorq (%rsi), %rax
-; X64-NEXT:    movzbl 8(%rdi), %ecx
-; X64-NEXT:    xorb 8(%rsi), %cl
-; X64-NEXT:    movzbl %cl, %ecx
-; X64-NEXT:    orq %rax, %rcx
-; X64-NEXT:    sete %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 9) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length10_eq(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length10_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rax
-; X64-NEXT:    xorq (%rsi), %rax
-; X64-NEXT:    movzwl 8(%rdi), %ecx
-; X64-NEXT:    xorw 8(%rsi), %cx
-; X64-NEXT:    movzwl %cx, %ecx
-; X64-NEXT:    orq %rax, %rcx
-; X64-NEXT:    sete %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 10) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length11_eq(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length11_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rax
-; X64-NEXT:    movq 3(%rdi), %rcx
-; X64-NEXT:    xorq (%rsi), %rax
-; X64-NEXT:    xorq 3(%rsi), %rcx
-; X64-NEXT:    orq %rax, %rcx
-; X64-NEXT:    sete %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 11) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length12_eq(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length12_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rax
-; X64-NEXT:    xorq (%rsi), %rax
-; X64-NEXT:    movl 8(%rdi), %ecx
-; X64-NEXT:    xorl 8(%rsi), %ecx
-; X64-NEXT:    orq %rax, %rcx
-; X64-NEXT:    setne %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 12) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length12(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length12:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rcx
-; X64-NEXT:    movq (%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    jne .LBB31_2
-; X64-NEXT:  # %bb.1: # %loadbb1
-; X64-NEXT:    movl 8(%rdi), %ecx
-; X64-NEXT:    movl 8(%rsi), %edx
-; X64-NEXT:    bswapl %ecx
-; X64-NEXT:    bswapl %edx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    je .LBB31_3
-; X64-NEXT:  .LBB31_2: # %res_block
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    orl $1, %eax
-; X64-NEXT:  .LBB31_3: # %endblock
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 12) nounwind
-  ret i32 %m
-}
-
-define i1 @length13_eq(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length13_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rax
-; X64-NEXT:    movq 5(%rdi), %rcx
-; X64-NEXT:    xorq (%rsi), %rax
-; X64-NEXT:    xorq 5(%rsi), %rcx
-; X64-NEXT:    orq %rax, %rcx
-; X64-NEXT:    sete %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 13) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length14_eq(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length14_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rax
-; X64-NEXT:    movq 6(%rdi), %rcx
-; X64-NEXT:    xorq (%rsi), %rax
-; X64-NEXT:    xorq 6(%rsi), %rcx
-; X64-NEXT:    orq %rax, %rcx
-; X64-NEXT:    sete %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 14) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length15(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length15:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rcx
-; X64-NEXT:    movq (%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    jne .LBB34_2
-; X64-NEXT:  # %bb.1: # %loadbb1
-; X64-NEXT:    movq 7(%rdi), %rcx
-; X64-NEXT:    movq 7(%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    je .LBB34_3
-; X64-NEXT:  .LBB34_2: # %res_block
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    orl $1, %eax
-; X64-NEXT:  .LBB34_3: # %endblock
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 15) nounwind
-  ret i32 %m
-}
-
-define i1 @length15_lt(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length15_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rcx
-; X64-NEXT:    movq (%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    jne .LBB35_2
-; X64-NEXT:  # %bb.1: # %loadbb1
-; X64-NEXT:    movq 7(%rdi), %rcx
-; X64-NEXT:    movq 7(%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    je .LBB35_3
-; X64-NEXT:  .LBB35_2: # %res_block
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    orl $1, %eax
-; X64-NEXT:  .LBB35_3: # %endblock
-; X64-NEXT:    shrl $31, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 15) nounwind
-  %c = icmp slt i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length15_const(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length15_const:
-; X64:       # %bb.0:
-; X64-NEXT:    movabsq $3544952156018063160, %rcx # imm = 0x3132333435363738
-; X64-NEXT:    movq (%rdi), %rdx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    cmpq %rcx, %rdx
-; X64-NEXT:    jne .LBB36_2
-; X64-NEXT:  # %bb.1: # %loadbb1
-; X64-NEXT:    movabsq $4051322327650219061, %rcx # imm = 0x3839303132333435
-; X64-NEXT:    movq 7(%rdi), %rdx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rcx, %rdx
-; X64-NEXT:    je .LBB36_3
-; X64-NEXT:  .LBB36_2: # %res_block
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rcx, %rdx
-; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    orl $1, %eax
-; X64-NEXT:  .LBB36_3: # %endblock
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i64 15) nounwind
-  ret i32 %m
-}
-
-define i1 @length15_eq(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length15_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rax
-; X64-NEXT:    movq 7(%rdi), %rcx
-; X64-NEXT:    xorq (%rsi), %rax
-; X64-NEXT:    xorq 7(%rsi), %rcx
-; X64-NEXT:    orq %rax, %rcx
-; X64-NEXT:    sete %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 15) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i1 @length15_gt_const(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length15_gt_const:
-; X64:       # %bb.0:
-; X64-NEXT:    movabsq $3544952156018063160, %rax # imm = 0x3132333435363738
-; X64-NEXT:    movq (%rdi), %rcx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    cmpq %rax, %rcx
-; X64-NEXT:    jne .LBB38_2
-; X64-NEXT:  # %bb.1: # %loadbb1
-; X64-NEXT:    movabsq $4051322327650219061, %rax # imm = 0x3839303132333435
-; X64-NEXT:    movq 7(%rdi), %rcx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    xorl %edx, %edx
-; X64-NEXT:    cmpq %rax, %rcx
-; X64-NEXT:    je .LBB38_3
-; X64-NEXT:  .LBB38_2: # %res_block
-; X64-NEXT:    xorl %edx, %edx
-; X64-NEXT:    cmpq %rax, %rcx
-; X64-NEXT:    sbbl %edx, %edx
-; X64-NEXT:    orl $1, %edx
-; X64-NEXT:  .LBB38_3: # %endblock
-; X64-NEXT:    testl %edx, %edx
-; X64-NEXT:    setg %al
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i64 15) nounwind
-  %c = icmp sgt i32 %m, 0
-  ret i1 %c
-}
-
-; PR33329 - https://bugs.llvm.org/show_bug.cgi?id=33329
-
-define i32 @length16(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length16:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rcx
-; X64-NEXT:    movq (%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    jne .LBB39_2
-; X64-NEXT:  # %bb.1: # %loadbb1
-; X64-NEXT:    movq 8(%rdi), %rcx
-; X64-NEXT:    movq 8(%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    je .LBB39_3
-; X64-NEXT:  .LBB39_2: # %res_block
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    orl $1, %eax
-; X64-NEXT:  .LBB39_3: # %endblock
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 16) nounwind
-  ret i32 %m
-}
-
-define i1 @length16_eq(ptr %x, ptr %y) nounwind {
-; X64-SSE2-LABEL: length16_eq:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT:    movdqu (%rsi), %xmm1
-; X64-SSE2-NEXT:    pcmpeqb %xmm0, %xmm1
-; X64-SSE2-NEXT:    pmovmskb %xmm1, %eax
-; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT:    setne %al
-; X64-SSE2-NEXT:    retq
-;
-; X64-SSE41-LABEL: length16_eq:
-; X64-SSE41:       # %bb.0:
-; X64-SSE41-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE41-NEXT:    movdqu (%rsi), %xmm1
-; X64-SSE41-NEXT:    pxor %xmm0, %xmm1
-; X64-SSE41-NEXT:    ptest %xmm1, %xmm1
-; X64-SSE41-NEXT:    setne %al
-; X64-SSE41-NEXT:    retq
-;
-; X64-AVX-LABEL: length16_eq:
-; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-AVX-NEXT:    vpxor (%rsi), %xmm0, %xmm0
-; X64-AVX-NEXT:    vptest %xmm0, %xmm0
-; X64-AVX-NEXT:    setne %al
-; X64-AVX-NEXT:    retq
-;
-; X64-MIC-AVX-LABEL: length16_eq:
-; X64-MIC-AVX:       # %bb.0:
-; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-MIC-AVX-NEXT:    vmovdqu (%rsi), %xmm1
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0
-; X64-MIC-AVX-NEXT:    kortestw %k0, %k0
-; X64-MIC-AVX-NEXT:    setne %al
-; X64-MIC-AVX-NEXT:    vzeroupper
-; X64-MIC-AVX-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 16) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length16_lt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length16_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rcx
-; X64-NEXT:    movq (%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    jne .LBB41_2
-; X64-NEXT:  # %bb.1: # %loadbb1
-; X64-NEXT:    movq 8(%rdi), %rcx
-; X64-NEXT:    movq 8(%rsi), %rdx
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    bswapq %rdx
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    je .LBB41_3
-; X64-NEXT:  .LBB41_2: # %res_block
-; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    orl $1, %eax
-; X64-NEXT:  .LBB41_3: # %endblock
-; X64-NEXT:    shrl $31, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 16) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length16_gt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length16_gt:
-; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rax
-; X64-NEXT:    movq (%rsi), %rcx
-; X64-NEXT:    bswapq %rax
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    cmpq %rcx, %rax
-; X64-NEXT:    jne .LBB42_2
-; X64-NEXT:  # %bb.1: # %loadbb1
-; X64-NEXT:    movq 8(%rdi), %rax
-; X64-NEXT:    movq 8(%rsi), %rcx
-; X64-NEXT:    bswapq %rax
-; X64-NEXT:    bswapq %rcx
-; X64-NEXT:    xorl %edx, %edx
-; X64-NEXT:    cmpq %rcx, %rax
-; X64-NEXT:    je .LBB42_3
-; X64-NEXT:  .LBB42_2: # %res_block
-; X64-NEXT:    xorl %edx, %edx
-; X64-NEXT:    cmpq %rcx, %rax
-; X64-NEXT:    sbbl %edx, %edx
-; X64-NEXT:    orl $1, %edx
-; X64-NEXT:  .LBB42_3: # %endblock
-; X64-NEXT:    testl %edx, %edx
-; X64-NEXT:    setg %al
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 16) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length16_eq_const(ptr %X) nounwind {
-; X64-SSE2-LABEL: length16_eq_const:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT:    sete %al
-; X64-SSE2-NEXT:    retq
-;
-; X64-SSE41-LABEL: length16_eq_const:
-; X64-SSE41:       # %bb.0:
-; X64-SSE41-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE41-NEXT:    ptest %xmm0, %xmm0
-; X64-SSE41-NEXT:    sete %al
-; X64-SSE41-NEXT:    retq
-;
-; X64-AVX-LABEL: length16_eq_const:
-; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-AVX-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT:    vptest %xmm0, %xmm0
-; X64-AVX-NEXT:    sete %al
-; X64-AVX-NEXT:    retq
-;
-; X64-MIC-AVX-LABEL: length16_eq_const:
-; X64-MIC-AVX:       # %bb.0:
-; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-MIC-AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [858927408,926299444,825243960,892613426]
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0
-; X64-MIC-AVX-NEXT:    kortestw %k0, %k0
-; X64-MIC-AVX-NEXT:    sete %al
-; X64-MIC-AVX-NEXT:    vzeroupper
-; X64-MIC-AVX-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 16) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-; PR33914 - https://bugs.llvm.org/show_bug.cgi?id=33914
-
-define i32 @length24(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length24:
-; X64:       # %bb.0:
-; X64-NEXT:    movl $24, %edx
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 24) nounwind
-  ret i32 %m
-}
-
-define i1 @length24_eq(ptr %x, ptr %y) nounwind {
-; X64-SSE2-LABEL: length24_eq:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT:    movdqu (%rsi), %xmm1
-; X64-SSE2-NEXT:    pcmpeqb %xmm0, %xmm1
-; X64-SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X64-SSE2-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
-; X64-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
-; X64-SSE2-NEXT:    pand %xmm1, %xmm2
-; X64-SSE2-NEXT:    pmovmskb %xmm2, %eax
-; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT:    sete %al
-; X64-SSE2-NEXT:    retq
-;
-; X64-SSE41-LABEL: length24_eq:
-; X64-SSE41:       # %bb.0:
-; X64-SSE41-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE41-NEXT:    movdqu (%rsi), %xmm1
-; X64-SSE41-NEXT:    pxor %xmm0, %xmm1
-; X64-SSE41-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X64-SSE41-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
-; X64-SSE41-NEXT:    pxor %xmm0, %xmm2
-; X64-SSE41-NEXT:    por %xmm1, %xmm2
-; X64-SSE41-NEXT:    ptest %xmm2, %xmm2
-; X64-SSE41-NEXT:    sete %al
-; X64-SSE41-NEXT:    retq
-;
-; X64-AVX-LABEL: length24_eq:
-; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
-; X64-AVX-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
-; X64-AVX-NEXT:    vpxor %xmm2, %xmm1, %xmm1
-; X64-AVX-NEXT:    vpxor (%rsi), %xmm0, %xmm0
-; X64-AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
-; X64-AVX-NEXT:    vptest %xmm0, %xmm0
-; X64-AVX-NEXT:    sete %al
-; X64-AVX-NEXT:    retq
-;
-; X64-MIC-AVX-LABEL: length24_eq:
-; X64-MIC-AVX:       # %bb.0:
-; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-MIC-AVX-NEXT:    vmovdqu (%rsi), %xmm1
-; X64-MIC-AVX-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
-; X64-MIC-AVX-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm3, %zmm2, %k0
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm1, %zmm0, %k1
-; X64-MIC-AVX-NEXT:    kortestw %k0, %k1
-; X64-MIC-AVX-NEXT:    sete %al
-; X64-MIC-AVX-NEXT:    vzeroupper
-; X64-MIC-AVX-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 24) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length24_lt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length24_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $24, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    shrl $31, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 24) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length24_gt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length24_gt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $24, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setg %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 24) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length24_eq_const(ptr %X) nounwind {
-; X64-SSE2-LABEL: length24_eq_const:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
-; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE2-NEXT:    pand %xmm1, %xmm0
-; X64-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT:    setne %al
-; X64-SSE2-NEXT:    retq
-;
-; X64-SSE41-LABEL: length24_eq_const:
-; X64-SSE41:       # %bb.0:
-; X64-SSE41-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE41-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
-; X64-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; X64-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE41-NEXT:    por %xmm1, %xmm0
-; X64-SSE41-NEXT:    ptest %xmm0, %xmm0
-; X64-SSE41-NEXT:    setne %al
-; X64-SSE41-NEXT:    retq
-;
-; X64-AVX-LABEL: length24_eq_const:
-; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
-; X64-AVX-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; X64-AVX-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT:    vptest %xmm0, %xmm0
-; X64-AVX-NEXT:    setne %al
-; X64-AVX-NEXT:    retq
-;
-; X64-MIC-AVX-LABEL: length24_eq_const:
-; X64-MIC-AVX:       # %bb.0:
-; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-MIC-AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
-; X64-MIC-AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [959985462,858927408,0,0]
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm2, %zmm1, %k0
-; X64-MIC-AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [858927408,926299444,825243960,892613426]
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm1, %zmm0, %k1
-; X64-MIC-AVX-NEXT:    kortestw %k0, %k1
-; X64-MIC-AVX-NEXT:    setne %al
-; X64-MIC-AVX-NEXT:    vzeroupper
-; X64-MIC-AVX-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 24) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length31(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length31:
-; X64:       # %bb.0:
-; X64-NEXT:    movl $31, %edx
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 31) nounwind
-  ret i32 %m
-}
-
-define i1 @length31_eq(ptr %x, ptr %y) nounwind {
-; X64-SSE2-LABEL: length31_eq:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT:    movdqu 15(%rdi), %xmm1
-; X64-SSE2-NEXT:    movdqu (%rsi), %xmm2
-; X64-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
-; X64-SSE2-NEXT:    movdqu 15(%rsi), %xmm0
-; X64-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
-; X64-SSE2-NEXT:    pand %xmm2, %xmm0
-; X64-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT:    sete %al
-; X64-SSE2-NEXT:    retq
-;
-; X64-SSE41-LABEL: length31_eq:
-; X64-SSE41:       # %bb.0:
-; X64-SSE41-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE41-NEXT:    movdqu 15(%rdi), %xmm1
-; X64-SSE41-NEXT:    movdqu (%rsi), %xmm2
-; X64-SSE41-NEXT:    pxor %xmm0, %xmm2
-; X64-SSE41-NEXT:    movdqu 15(%rsi), %xmm0
-; X64-SSE41-NEXT:    pxor %xmm1, %xmm0
-; X64-SSE41-NEXT:    por %xmm2, %xmm0
-; X64-SSE41-NEXT:    ptest %xmm0, %xmm0
-; X64-SSE41-NEXT:    sete %al
-; X64-SSE41-NEXT:    retq
-;
-; X64-AVX-LABEL: length31_eq:
-; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-AVX-NEXT:    vmovdqu 15(%rdi), %xmm1
-; X64-AVX-NEXT:    vpxor 15(%rsi), %xmm1, %xmm1
-; X64-AVX-NEXT:    vpxor (%rsi), %xmm0, %xmm0
-; X64-AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT:    vptest %xmm0, %xmm0
-; X64-AVX-NEXT:    sete %al
-; X64-AVX-NEXT:    retq
-;
-; X64-MIC-AVX-LABEL: length31_eq:
-; X64-MIC-AVX:       # %bb.0:
-; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-MIC-AVX-NEXT:    vmovdqu 15(%rdi), %xmm1
-; X64-MIC-AVX-NEXT:    vmovdqu (%rsi), %xmm2
-; X64-MIC-AVX-NEXT:    vmovdqu 15(%rsi), %xmm3
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm3, %zmm1, %k0
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm2, %zmm0, %k1
-; X64-MIC-AVX-NEXT:    kortestw %k0, %k1
-; X64-MIC-AVX-NEXT:    sete %al
-; X64-MIC-AVX-NEXT:    vzeroupper
-; X64-MIC-AVX-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 31) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length31_lt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length31_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $31, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    shrl $31, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 31) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length31_gt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length31_gt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $31, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setg %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 31) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length31_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"="128" {
-; X64-SSE2-LABEL: length31_eq_prefer128:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT:    movdqu 15(%rdi), %xmm1
-; X64-SSE2-NEXT:    movdqu (%rsi), %xmm2
-; X64-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
-; X64-SSE2-NEXT:    movdqu 15(%rsi), %xmm0
-; X64-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
-; X64-SSE2-NEXT:    pand %xmm2, %xmm0
-; X64-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT:    sete %al
-; X64-SSE2-NEXT:    retq
-;
-; X64-SSE41-LABEL: length31_eq_prefer128:
-; X64-SSE41:       # %bb.0:
-; X64-SSE41-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE41-NEXT:    movdqu 15(%rdi), %xmm1
-; X64-SSE41-NEXT:    movdqu (%rsi), %xmm2
-; X64-SSE41-NEXT:    pxor %xmm0, %xmm2
-; X64-SSE41-NEXT:    movdqu 15(%rsi), %xmm0
-; X64-SSE41-NEXT:    pxor %xmm1, %xmm0
-; X64-SSE41-NEXT:    por %xmm2, %xmm0
-; X64-SSE41-NEXT:    ptest %xmm0, %xmm0
-; X64-SSE41-NEXT:    sete %al
-; X64-SSE41-NEXT:    retq
-;
-; X64-AVX-LABEL: length31_eq_prefer128:
-; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-AVX-NEXT:    vmovdqu 15(%rdi), %xmm1
-; X64-AVX-NEXT:    vpxor 15(%rsi), %xmm1, %xmm1
-; X64-AVX-NEXT:    vpxor (%rsi), %xmm0, %xmm0
-; X64-AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT:    vptest %xmm0, %xmm0
-; X64-AVX-NEXT:    sete %al
-; X64-AVX-NEXT:    retq
-;
-; X64-MIC-AVX-LABEL: length31_eq_prefer128:
-; X64-MIC-AVX:       # %bb.0:
-; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-MIC-AVX-NEXT:    vmovdqu 15(%rdi), %xmm1
-; X64-MIC-AVX-NEXT:    vmovdqu (%rsi), %xmm2
-; X64-MIC-AVX-NEXT:    vmovdqu 15(%rsi), %xmm3
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm3, %zmm1, %k0
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm2, %zmm0, %k1
-; X64-MIC-AVX-NEXT:    kortestw %k0, %k1
-; X64-MIC-AVX-NEXT:    sete %al
-; X64-MIC-AVX-NEXT:    vzeroupper
-; X64-MIC-AVX-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 31) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length31_eq_const(ptr %X) nounwind {
-; X64-SSE2-LABEL: length31_eq_const:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT:    movdqu 15(%rdi), %xmm1
-; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE2-NEXT:    pand %xmm1, %xmm0
-; X64-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT:    setne %al
-; X64-SSE2-NEXT:    retq
-;
-; X64-SSE41-LABEL: length31_eq_const:
-; X64-SSE41:       # %bb.0:
-; X64-SSE41-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE41-NEXT:    movdqu 15(%rdi), %xmm1
-; X64-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; X64-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE41-NEXT:    por %xmm1, %xmm0
-; X64-SSE41-NEXT:    ptest %xmm0, %xmm0
-; X64-SSE41-NEXT:    setne %al
-; X64-SSE41-NEXT:    retq
-;
-; X64-AVX-LABEL: length31_eq_const:
-; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-AVX-NEXT:    vmovdqu 15(%rdi), %xmm1
-; X64-AVX-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; X64-AVX-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT:    vptest %xmm0, %xmm0
-; X64-AVX-NEXT:    setne %al
-; X64-AVX-NEXT:    retq
-;
-; X64-MIC-AVX-LABEL: length31_eq_const:
-; X64-MIC-AVX:       # %bb.0:
-; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-MIC-AVX-NEXT:    vmovdqu 15(%rdi), %xmm1
-; X64-MIC-AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [943142453,842084409,909456435,809056311]
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm2, %zmm1, %k0
-; X64-MIC-AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [858927408,926299444,825243960,892613426]
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm1, %zmm0, %k1
-; X64-MIC-AVX-NEXT:    kortestw %k0, %k1
-; X64-MIC-AVX-NEXT:    setne %al
-; X64-MIC-AVX-NEXT:    vzeroupper
-; X64-MIC-AVX-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 31) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length32(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length32:
-; X64:       # %bb.0:
-; X64-NEXT:    movl $32, %edx
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 32) nounwind
-  ret i32 %m
-}
-
-; PR33325 - https://bugs.llvm.org/show_bug.cgi?id=33325
-
-define i1 @length32_eq(ptr %x, ptr %y) nounwind {
-; X64-SSE2-LABEL: length32_eq:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-SSE2-NEXT:    movdqu (%rsi), %xmm2
-; X64-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
-; X64-SSE2-NEXT:    movdqu 16(%rsi), %xmm0
-; X64-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
-; X64-SSE2-NEXT:    pand %xmm2, %xmm0
-; X64-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT:    sete %al
-; X64-SSE2-NEXT:    retq
-;
-; X64-SSE41-LABEL: length32_eq:
-; X64-SSE41:       # %bb.0:
-; X64-SSE41-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE41-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-SSE41-NEXT:    movdqu (%rsi), %xmm2
-; X64-SSE41-NEXT:    pxor %xmm0, %xmm2
-; X64-SSE41-NEXT:    movdqu 16(%rsi), %xmm0
-; X64-SSE41-NEXT:    pxor %xmm1, %xmm0
-; X64-SSE41-NEXT:    por %xmm2, %xmm0
-; X64-SSE41-NEXT:    ptest %xmm0, %xmm0
-; X64-SSE41-NEXT:    sete %al
-; X64-SSE41-NEXT:    retq
-;
-; X64-AVX1-LABEL: length32_eq:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    vmovups (%rdi), %ymm0
-; X64-AVX1-NEXT:    vxorps (%rsi), %ymm0, %ymm0
-; X64-AVX1-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX1-NEXT:    sete %al
-; X64-AVX1-NEXT:    vzeroupper
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length32_eq:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT:    vpxor (%rsi), %ymm0, %ymm0
-; X64-AVX2-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX2-NEXT:    sete %al
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    retq
-;
-; X64-AVX512-LABEL: length32_eq:
-; X64-AVX512:       # %bb.0:
-; X64-AVX512-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX512-NEXT:    vpxor (%rsi), %ymm0, %ymm0
-; X64-AVX512-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX512-NEXT:    sete %al
-; X64-AVX512-NEXT:    vzeroupper
-; X64-AVX512-NEXT:    retq
-;
-; X64-MIC-AVX-LABEL: length32_eq:
-; X64-MIC-AVX:       # %bb.0:
-; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-MIC-AVX-NEXT:    vmovdqu (%rsi), %ymm1
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0
-; X64-MIC-AVX-NEXT:    kortestw %k0, %k0
-; X64-MIC-AVX-NEXT:    sete %al
-; X64-MIC-AVX-NEXT:    vzeroupper
-; X64-MIC-AVX-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 32) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length32_lt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length32_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $32, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    shrl $31, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 32) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length32_gt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length32_gt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $32, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setg %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 32) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length32_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"="128" {
-; X64-SSE2-LABEL: length32_eq_prefer128:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-SSE2-NEXT:    movdqu (%rsi), %xmm2
-; X64-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
-; X64-SSE2-NEXT:    movdqu 16(%rsi), %xmm0
-; X64-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
-; X64-SSE2-NEXT:    pand %xmm2, %xmm0
-; X64-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT:    sete %al
-; X64-SSE2-NEXT:    retq
-;
-; X64-SSE41-LABEL: length32_eq_prefer128:
-; X64-SSE41:       # %bb.0:
-; X64-SSE41-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE41-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-SSE41-NEXT:    movdqu (%rsi), %xmm2
-; X64-SSE41-NEXT:    pxor %xmm0, %xmm2
-; X64-SSE41-NEXT:    movdqu 16(%rsi), %xmm0
-; X64-SSE41-NEXT:    pxor %xmm1, %xmm0
-; X64-SSE41-NEXT:    por %xmm2, %xmm0
-; X64-SSE41-NEXT:    ptest %xmm0, %xmm0
-; X64-SSE41-NEXT:    sete %al
-; X64-SSE41-NEXT:    retq
-;
-; X64-AVX-LABEL: length32_eq_prefer128:
-; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-AVX-NEXT:    vmovdqu 16(%rdi), %xmm1
-; X64-AVX-NEXT:    vpxor 16(%rsi), %xmm1, %xmm1
-; X64-AVX-NEXT:    vpxor (%rsi), %xmm0, %xmm0
-; X64-AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT:    vptest %xmm0, %xmm0
-; X64-AVX-NEXT:    sete %al
-; X64-AVX-NEXT:    retq
-;
-; X64-MIC-AVX-LABEL: length32_eq_prefer128:
-; X64-MIC-AVX:       # %bb.0:
-; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %xmm0
-; X64-MIC-AVX-NEXT:    vmovdqu 16(%rdi), %xmm1
-; X64-MIC-AVX-NEXT:    vmovdqu (%rsi), %xmm2
-; X64-MIC-AVX-NEXT:    vmovdqu 16(%rsi), %xmm3
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm3, %zmm1, %k0
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm2, %zmm0, %k1
-; X64-MIC-AVX-NEXT:    kortestw %k0, %k1
-; X64-MIC-AVX-NEXT:    sete %al
-; X64-MIC-AVX-NEXT:    vzeroupper
-; X64-MIC-AVX-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 32) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length32_eq_const(ptr %X) nounwind {
-; X64-SSE2-LABEL: length32_eq_const:
-; X64-SSE2:       # %bb.0:
-; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE2-NEXT:    pand %xmm1, %xmm0
-; X64-SSE2-NEXT:    pmovmskb %xmm0, %eax
-; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
-; X64-SSE2-NEXT:    setne %al
-; X64-SSE2-NEXT:    retq
-;
-; X64-SSE41-LABEL: length32_eq_const:
-; X64-SSE41:       # %bb.0:
-; X64-SSE41-NEXT:    movdqu (%rdi), %xmm0
-; X64-SSE41-NEXT:    movdqu 16(%rdi), %xmm1
-; X64-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; X64-SSE41-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE41-NEXT:    por %xmm1, %xmm0
-; X64-SSE41-NEXT:    ptest %xmm0, %xmm0
-; X64-SSE41-NEXT:    setne %al
-; X64-SSE41-NEXT:    retq
-;
-; X64-AVX1-LABEL: length32_eq_const:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    vmovups (%rdi), %ymm0
-; X64-AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX1-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX1-NEXT:    setne %al
-; X64-AVX1-NEXT:    vzeroupper
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length32_eq_const:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX2-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX2-NEXT:    setne %al
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    retq
-;
-; X64-AVX512-LABEL: length32_eq_const:
-; X64-AVX512:       # %bb.0:
-; X64-AVX512-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX512-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX512-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX512-NEXT:    setne %al
-; X64-AVX512-NEXT:    vzeroupper
-; X64-AVX512-NEXT:    retq
-;
-; X64-MIC-AVX-LABEL: length32_eq_const:
-; X64-MIC-AVX:       # %bb.0:
-; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-MIC-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [858927408,926299444,825243960,892613426,959985462,858927408,926299444,825243960]
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0
-; X64-MIC-AVX-NEXT:    kortestw %k0, %k0
-; X64-MIC-AVX-NEXT:    setne %al
-; X64-MIC-AVX-NEXT:    vzeroupper
-; X64-MIC-AVX-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 32) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length48(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length48:
-; X64:       # %bb.0:
-; X64-NEXT:    movl $48, %edx
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 48) nounwind
-  ret i32 %m
-}
-
-define i1 @length48_eq(ptr %x, ptr %y) nounwind {
-; X64-SSE-LABEL: length48_eq:
-; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    pushq %rax
-; X64-SSE-NEXT:    movl $48, %edx
-; X64-SSE-NEXT:    callq memcmp
-; X64-SSE-NEXT:    testl %eax, %eax
-; X64-SSE-NEXT:    sete %al
-; X64-SSE-NEXT:    popq %rcx
-; X64-SSE-NEXT:    retq
-;
-; X64-AVX1-LABEL: length48_eq:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    vmovups (%rdi), %ymm0
-; X64-AVX1-NEXT:    vmovups 32(%rdi), %xmm1
-; X64-AVX1-NEXT:    vmovups 32(%rsi), %xmm2
-; X64-AVX1-NEXT:    vxorps (%rsi), %ymm0, %ymm0
-; X64-AVX1-NEXT:    vxorps %ymm2, %ymm1, %ymm1
-; X64-AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
-; X64-AVX1-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX1-NEXT:    sete %al
-; X64-AVX1-NEXT:    vzeroupper
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length48_eq:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT:    vmovdqu 32(%rdi), %xmm1
-; X64-AVX2-NEXT:    vmovdqu 32(%rsi), %xmm2
-; X64-AVX2-NEXT:    vpxor (%rsi), %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpxor %ymm2, %ymm1, %ymm1
-; X64-AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX2-NEXT:    sete %al
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    retq
-;
-; X64-AVX512-LABEL: length48_eq:
-; X64-AVX512:       # %bb.0:
-; X64-AVX512-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX512-NEXT:    vmovdqu 32(%rdi), %xmm1
-; X64-AVX512-NEXT:    vmovdqu 32(%rsi), %xmm2
-; X64-AVX512-NEXT:    vpxor (%rsi), %ymm0, %ymm0
-; X64-AVX512-NEXT:    vpxor %ymm2, %ymm1, %ymm1
-; X64-AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX512-NEXT:    sete %al
-; X64-AVX512-NEXT:    vzeroupper
-; X64-AVX512-NEXT:    retq
-;
-; X64-MIC-AVX-LABEL: length48_eq:
-; X64-MIC-AVX:       # %bb.0:
-; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-MIC-AVX-NEXT:    vmovdqu (%rsi), %ymm1
-; X64-MIC-AVX-NEXT:    vmovdqu 32(%rdi), %xmm2
-; X64-MIC-AVX-NEXT:    vmovdqu 32(%rsi), %xmm3
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm3, %zmm2, %k0
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm1, %zmm0, %k1
-; X64-MIC-AVX-NEXT:    kortestw %k0, %k1
-; X64-MIC-AVX-NEXT:    sete %al
-; X64-MIC-AVX-NEXT:    vzeroupper
-; X64-MIC-AVX-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 48) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length48_lt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length48_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $48, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    shrl $31, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 48) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length48_gt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length48_gt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $48, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setg %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 48) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length48_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"="128" {
-; X64-LABEL: length48_eq_prefer128:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $48, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    sete %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 48) nounwind
-  %cmp = icmp eq i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length48_eq_const(ptr %X) nounwind {
-; X64-SSE-LABEL: length48_eq_const:
-; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    pushq %rax
-; X64-SSE-NEXT:    movl $.L.str, %esi
-; X64-SSE-NEXT:    movl $48, %edx
-; X64-SSE-NEXT:    callq memcmp
-; X64-SSE-NEXT:    testl %eax, %eax
-; X64-SSE-NEXT:    setne %al
-; X64-SSE-NEXT:    popq %rcx
-; X64-SSE-NEXT:    retq
-;
-; X64-AVX1-LABEL: length48_eq_const:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    vmovups (%rdi), %ymm0
-; X64-AVX1-NEXT:    vmovups 32(%rdi), %xmm1
-; X64-AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; X64-AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
-; X64-AVX1-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX1-NEXT:    setne %al
-; X64-AVX1-NEXT:    vzeroupper
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length48_eq_const:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT:    vmovdqu 32(%rdi), %xmm1
-; X64-AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; X64-AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX2-NEXT:    setne %al
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    retq
-;
-; X64-AVX512-LABEL: length48_eq_const:
-; X64-AVX512:       # %bb.0:
-; X64-AVX512-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX512-NEXT:    vmovdqu 32(%rdi), %xmm1
-; X64-AVX512-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX512-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; X64-AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX512-NEXT:    setne %al
-; X64-AVX512-NEXT:    vzeroupper
-; X64-AVX512-NEXT:    retq
-;
-; X64-MIC-AVX-LABEL: length48_eq_const:
-; X64-MIC-AVX:       # %bb.0:
-; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-MIC-AVX-NEXT:    vmovdqu 32(%rdi), %xmm1
-; X64-MIC-AVX-NEXT:    vmovdqa {{.*#+}} ymm2 = [892613426,959985462,858927408,926299444,0,0,0,0]
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm2, %zmm1, %k0
-; X64-MIC-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [858927408,926299444,825243960,892613426,959985462,858927408,926299444,825243960]
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm1, %zmm0, %k1
-; X64-MIC-AVX-NEXT:    kortestw %k0, %k1
-; X64-MIC-AVX-NEXT:    setne %al
-; X64-MIC-AVX-NEXT:    vzeroupper
-; X64-MIC-AVX-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 48) nounwind
-  %c = icmp ne i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length63(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length63:
-; X64:       # %bb.0:
-; X64-NEXT:    movl $63, %edx
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 63) nounwind
-  ret i32 %m
-}
-
-define i1 @length63_eq(ptr %x, ptr %y) nounwind {
-; X64-SSE-LABEL: length63_eq:
-; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    pushq %rax
-; X64-SSE-NEXT:    movl $63, %edx
-; X64-SSE-NEXT:    callq memcmp
-; X64-SSE-NEXT:    testl %eax, %eax
-; X64-SSE-NEXT:    setne %al
-; X64-SSE-NEXT:    popq %rcx
-; X64-SSE-NEXT:    retq
-;
-; X64-AVX1-LABEL: length63_eq:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    vmovups (%rdi), %ymm0
-; X64-AVX1-NEXT:    vmovups 31(%rdi), %ymm1
-; X64-AVX1-NEXT:    vxorps 31(%rsi), %ymm1, %ymm1
-; X64-AVX1-NEXT:    vxorps (%rsi), %ymm0, %ymm0
-; X64-AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
-; X64-AVX1-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX1-NEXT:    setne %al
-; X64-AVX1-NEXT:    vzeroupper
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length63_eq:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT:    vmovdqu 31(%rdi), %ymm1
-; X64-AVX2-NEXT:    vpxor 31(%rsi), %ymm1, %ymm1
-; X64-AVX2-NEXT:    vpxor (%rsi), %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX2-NEXT:    setne %al
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    retq
-;
-; X64-AVX512-LABEL: length63_eq:
-; X64-AVX512:       # %bb.0:
-; X64-AVX512-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX512-NEXT:    vmovdqu 31(%rdi), %ymm1
-; X64-AVX512-NEXT:    vpxor 31(%rsi), %ymm1, %ymm1
-; X64-AVX512-NEXT:    vpxor (%rsi), %ymm0, %ymm0
-; X64-AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX512-NEXT:    setne %al
-; X64-AVX512-NEXT:    vzeroupper
-; X64-AVX512-NEXT:    retq
-;
-; X64-MIC-AVX-LABEL: length63_eq:
-; X64-MIC-AVX:       # %bb.0:
-; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-MIC-AVX-NEXT:    vmovdqu 31(%rdi), %ymm1
-; X64-MIC-AVX-NEXT:    vmovdqu (%rsi), %ymm2
-; X64-MIC-AVX-NEXT:    vmovdqu 31(%rsi), %ymm3
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm3, %zmm1, %k0
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm2, %zmm0, %k1
-; X64-MIC-AVX-NEXT:    kortestw %k0, %k1
-; X64-MIC-AVX-NEXT:    setne %al
-; X64-MIC-AVX-NEXT:    vzeroupper
-; X64-MIC-AVX-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 63) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length63_lt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length63_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $63, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    shrl $31, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 63) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length63_gt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length63_gt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $63, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setg %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 63) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length63_eq_const(ptr %X) nounwind {
-; X64-SSE-LABEL: length63_eq_const:
-; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    pushq %rax
-; X64-SSE-NEXT:    movl $.L.str, %esi
-; X64-SSE-NEXT:    movl $63, %edx
-; X64-SSE-NEXT:    callq memcmp
-; X64-SSE-NEXT:    testl %eax, %eax
-; X64-SSE-NEXT:    sete %al
-; X64-SSE-NEXT:    popq %rcx
-; X64-SSE-NEXT:    retq
-;
-; X64-AVX1-LABEL: length63_eq_const:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    vmovups (%rdi), %ymm0
-; X64-AVX1-NEXT:    vmovups 31(%rdi), %ymm1
-; X64-AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; X64-AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
-; X64-AVX1-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX1-NEXT:    sete %al
-; X64-AVX1-NEXT:    vzeroupper
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length63_eq_const:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT:    vmovdqu 31(%rdi), %ymm1
-; X64-AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; X64-AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX2-NEXT:    sete %al
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    retq
-;
-; X64-AVX512-LABEL: length63_eq_const:
-; X64-AVX512:       # %bb.0:
-; X64-AVX512-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX512-NEXT:    vmovdqu 31(%rdi), %ymm1
-; X64-AVX512-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; X64-AVX512-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX512-NEXT:    sete %al
-; X64-AVX512-NEXT:    vzeroupper
-; X64-AVX512-NEXT:    retq
-;
-; X64-MIC-AVX-LABEL: length63_eq_const:
-; X64-MIC-AVX:       # %bb.0:
-; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-MIC-AVX-NEXT:    vmovdqu 31(%rdi), %ymm1
-; X64-MIC-AVX-NEXT:    vmovdqa {{.*#+}} ymm2 = [875770417,943142453,842084409,909456435,809056311,875770417,943142453,842084409]
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm2, %zmm1, %k0
-; X64-MIC-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [858927408,926299444,825243960,892613426,959985462,858927408,926299444,825243960]
-; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm1, %zmm0, %k1
-; X64-MIC-AVX-NEXT:    kortestw %k0, %k1
-; X64-MIC-AVX-NEXT:    sete %al
-; X64-MIC-AVX-NEXT:    vzeroupper
-; X64-MIC-AVX-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 63) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length64(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length64:
-; X64:       # %bb.0:
-; X64-NEXT:    movl $64, %edx
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 64) nounwind
-  ret i32 %m
-}
-
-define i1 @length64_eq(ptr %x, ptr %y) nounwind {
-; X64-SSE-LABEL: length64_eq:
-; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    pushq %rax
-; X64-SSE-NEXT:    movl $64, %edx
-; X64-SSE-NEXT:    callq memcmp
-; X64-SSE-NEXT:    testl %eax, %eax
-; X64-SSE-NEXT:    setne %al
-; X64-SSE-NEXT:    popq %rcx
-; X64-SSE-NEXT:    retq
-;
-; X64-AVX1-LABEL: length64_eq:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    vmovups (%rdi), %ymm0
-; X64-AVX1-NEXT:    vmovups 32(%rdi), %ymm1
-; X64-AVX1-NEXT:    vxorps 32(%rsi), %ymm1, %ymm1
-; X64-AVX1-NEXT:    vxorps (%rsi), %ymm0, %ymm0
-; X64-AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
-; X64-AVX1-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX1-NEXT:    setne %al
-; X64-AVX1-NEXT:    vzeroupper
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length64_eq:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT:    vmovdqu 32(%rdi), %ymm1
-; X64-AVX2-NEXT:    vpxor 32(%rsi), %ymm1, %ymm1
-; X64-AVX2-NEXT:    vpxor (%rsi), %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX2-NEXT:    setne %al
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    retq
-;
-; X64-AVX512-LABEL: length64_eq:
-; X64-AVX512:       # %bb.0:
-; X64-AVX512-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-AVX512-NEXT:    vpcmpneqd (%rsi), %zmm0, %k0
-; X64-AVX512-NEXT:    kortestw %k0, %k0
-; X64-AVX512-NEXT:    setne %al
-; X64-AVX512-NEXT:    vzeroupper
-; X64-AVX512-NEXT:    retq
-;
-; X64-MIC-AVX2-LABEL: length64_eq:
-; X64-MIC-AVX2:       # %bb.0:
-; X64-MIC-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-MIC-AVX2-NEXT:    vmovdqu 32(%rdi), %ymm1
-; X64-MIC-AVX2-NEXT:    vmovdqu (%rsi), %ymm2
-; X64-MIC-AVX2-NEXT:    vmovdqu 32(%rsi), %ymm3
-; X64-MIC-AVX2-NEXT:    vpcmpneqd %zmm3, %zmm1, %k0
-; X64-MIC-AVX2-NEXT:    vpcmpneqd %zmm2, %zmm0, %k1
-; X64-MIC-AVX2-NEXT:    kortestw %k0, %k1
-; X64-MIC-AVX2-NEXT:    setne %al
-; X64-MIC-AVX2-NEXT:    vzeroupper
-; X64-MIC-AVX2-NEXT:    retq
-;
-; X64-MIC-AVX512F-LABEL: length64_eq:
-; X64-MIC-AVX512F:       # %bb.0:
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd (%rsi), %zmm0, %k0
-; X64-MIC-AVX512F-NEXT:    kortestw %k0, %k0
-; X64-MIC-AVX512F-NEXT:    setne %al
-; X64-MIC-AVX512F-NEXT:    vzeroupper
-; X64-MIC-AVX512F-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 64) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length64_lt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length64_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $64, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    shrl $31, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 64) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length64_gt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length64_gt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $64, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setg %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 64) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length64_eq_const(ptr %X) nounwind {
-; X64-SSE-LABEL: length64_eq_const:
-; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    pushq %rax
-; X64-SSE-NEXT:    movl $.L.str, %esi
-; X64-SSE-NEXT:    movl $64, %edx
-; X64-SSE-NEXT:    callq memcmp
-; X64-SSE-NEXT:    testl %eax, %eax
-; X64-SSE-NEXT:    sete %al
-; X64-SSE-NEXT:    popq %rcx
-; X64-SSE-NEXT:    retq
-;
-; X64-AVX1-LABEL: length64_eq_const:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    vmovups (%rdi), %ymm0
-; X64-AVX1-NEXT:    vmovups 32(%rdi), %ymm1
-; X64-AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; X64-AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
-; X64-AVX1-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX1-NEXT:    sete %al
-; X64-AVX1-NEXT:    vzeroupper
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length64_eq_const:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT:    vmovdqu 32(%rdi), %ymm1
-; X64-AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; X64-AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vptest %ymm0, %ymm0
-; X64-AVX2-NEXT:    sete %al
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    retq
-;
-; X64-AVX512-LABEL: length64_eq_const:
-; X64-AVX512:       # %bb.0:
-; X64-AVX512-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-AVX512-NEXT:    vpcmpneqd .L.str(%rip), %zmm0, %k0
-; X64-AVX512-NEXT:    kortestw %k0, %k0
-; X64-AVX512-NEXT:    sete %al
-; X64-AVX512-NEXT:    vzeroupper
-; X64-AVX512-NEXT:    retq
-;
-; X64-MIC-AVX2-LABEL: length64_eq_const:
-; X64-MIC-AVX2:       # %bb.0:
-; X64-MIC-AVX2-NEXT:    vmovdqu (%rdi), %ymm0
-; X64-MIC-AVX2-NEXT:    vmovdqu 32(%rdi), %ymm1
-; X64-MIC-AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [892613426,959985462,858927408,926299444,825243960,892613426,959985462,858927408]
-; X64-MIC-AVX2-NEXT:    vpcmpneqd %zmm2, %zmm1, %k0
-; X64-MIC-AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [858927408,926299444,825243960,892613426,959985462,858927408,926299444,825243960]
-; X64-MIC-AVX2-NEXT:    vpcmpneqd %zmm1, %zmm0, %k1
-; X64-MIC-AVX2-NEXT:    kortestw %k0, %k1
-; X64-MIC-AVX2-NEXT:    sete %al
-; X64-MIC-AVX2-NEXT:    vzeroupper
-; X64-MIC-AVX2-NEXT:    retq
-;
-; X64-MIC-AVX512F-LABEL: length64_eq_const:
-; X64-MIC-AVX512F:       # %bb.0:
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd .L.str(%rip), %zmm0, %k0
-; X64-MIC-AVX512F-NEXT:    kortestw %k0, %k0
-; X64-MIC-AVX512F-NEXT:    sete %al
-; X64-MIC-AVX512F-NEXT:    vzeroupper
-; X64-MIC-AVX512F-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 64) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length96(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length96:
-; X64:       # %bb.0:
-; X64-NEXT:    movl $96, %edx
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 96) nounwind
-  ret i32 %m
-}
-
-define i1 @length96_eq(ptr %x, ptr %y) nounwind {
-; X64-SSE-LABEL: length96_eq:
-; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    pushq %rax
-; X64-SSE-NEXT:    movl $96, %edx
-; X64-SSE-NEXT:    callq memcmp
-; X64-SSE-NEXT:    testl %eax, %eax
-; X64-SSE-NEXT:    setne %al
-; X64-SSE-NEXT:    popq %rcx
-; X64-SSE-NEXT:    retq
-;
-; X64-AVX1-LABEL: length96_eq:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    pushq %rax
-; X64-AVX1-NEXT:    movl $96, %edx
-; X64-AVX1-NEXT:    callq memcmp
-; X64-AVX1-NEXT:    testl %eax, %eax
-; X64-AVX1-NEXT:    setne %al
-; X64-AVX1-NEXT:    popq %rcx
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length96_eq:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    pushq %rax
-; X64-AVX2-NEXT:    movl $96, %edx
-; X64-AVX2-NEXT:    callq memcmp
-; X64-AVX2-NEXT:    testl %eax, %eax
-; X64-AVX2-NEXT:    setne %al
-; X64-AVX2-NEXT:    popq %rcx
-; X64-AVX2-NEXT:    retq
-;
-; X64-AVX512BW-LABEL: length96_eq:
-; X64-AVX512BW:       # %bb.0:
-; X64-AVX512BW-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-AVX512BW-NEXT:    vmovdqu 64(%rdi), %ymm1
-; X64-AVX512BW-NEXT:    vmovdqu 64(%rsi), %ymm2
-; X64-AVX512BW-NEXT:    vpcmpneqb (%rsi), %zmm0, %k0
-; X64-AVX512BW-NEXT:    vpcmpneqb %zmm2, %zmm1, %k1
-; X64-AVX512BW-NEXT:    kortestq %k1, %k0
-; X64-AVX512BW-NEXT:    setne %al
-; X64-AVX512BW-NEXT:    vzeroupper
-; X64-AVX512BW-NEXT:    retq
-;
-; X64-AVX512F-LABEL: length96_eq:
-; X64-AVX512F:       # %bb.0:
-; X64-AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-AVX512F-NEXT:    vmovdqu 64(%rdi), %ymm1
-; X64-AVX512F-NEXT:    vmovdqu 64(%rsi), %ymm2
-; X64-AVX512F-NEXT:    vpcmpneqd (%rsi), %zmm0, %k0
-; X64-AVX512F-NEXT:    vpcmpneqd %zmm2, %zmm1, %k1
-; X64-AVX512F-NEXT:    kortestw %k1, %k0
-; X64-AVX512F-NEXT:    setne %al
-; X64-AVX512F-NEXT:    vzeroupper
-; X64-AVX512F-NEXT:    retq
-;
-; X64-MIC-AVX2-LABEL: length96_eq:
-; X64-MIC-AVX2:       # %bb.0:
-; X64-MIC-AVX2-NEXT:    pushq %rax
-; X64-MIC-AVX2-NEXT:    movl $96, %edx
-; X64-MIC-AVX2-NEXT:    callq memcmp
-; X64-MIC-AVX2-NEXT:    testl %eax, %eax
-; X64-MIC-AVX2-NEXT:    setne %al
-; X64-MIC-AVX2-NEXT:    popq %rcx
-; X64-MIC-AVX2-NEXT:    retq
-;
-; X64-MIC-AVX512F-LABEL: length96_eq:
-; X64-MIC-AVX512F:       # %bb.0:
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-MIC-AVX512F-NEXT:    vmovdqu 64(%rdi), %ymm1
-; X64-MIC-AVX512F-NEXT:    vmovdqu 64(%rsi), %ymm2
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd (%rsi), %zmm0, %k0
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd %zmm2, %zmm1, %k1
-; X64-MIC-AVX512F-NEXT:    kortestw %k1, %k0
-; X64-MIC-AVX512F-NEXT:    setne %al
-; X64-MIC-AVX512F-NEXT:    vzeroupper
-; X64-MIC-AVX512F-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 96) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length96_lt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length96_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $96, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    shrl $31, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 96) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length96_gt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length96_gt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $96, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setg %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 96) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length96_eq_const(ptr %X) nounwind {
-; X64-SSE-LABEL: length96_eq_const:
-; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    pushq %rax
-; X64-SSE-NEXT:    movl $.L.str, %esi
-; X64-SSE-NEXT:    movl $96, %edx
-; X64-SSE-NEXT:    callq memcmp
-; X64-SSE-NEXT:    testl %eax, %eax
-; X64-SSE-NEXT:    sete %al
-; X64-SSE-NEXT:    popq %rcx
-; X64-SSE-NEXT:    retq
-;
-; X64-AVX1-LABEL: length96_eq_const:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    pushq %rax
-; X64-AVX1-NEXT:    movl $.L.str, %esi
-; X64-AVX1-NEXT:    movl $96, %edx
-; X64-AVX1-NEXT:    callq memcmp
-; X64-AVX1-NEXT:    testl %eax, %eax
-; X64-AVX1-NEXT:    sete %al
-; X64-AVX1-NEXT:    popq %rcx
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length96_eq_const:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    pushq %rax
-; X64-AVX2-NEXT:    movl $.L.str, %esi
-; X64-AVX2-NEXT:    movl $96, %edx
-; X64-AVX2-NEXT:    callq memcmp
-; X64-AVX2-NEXT:    testl %eax, %eax
-; X64-AVX2-NEXT:    sete %al
-; X64-AVX2-NEXT:    popq %rcx
-; X64-AVX2-NEXT:    retq
-;
-; X64-AVX512BW-LABEL: length96_eq_const:
-; X64-AVX512BW:       # %bb.0:
-; X64-AVX512BW-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-AVX512BW-NEXT:    vmovdqu 64(%rdi), %ymm1
-; X64-AVX512BW-NEXT:    vpcmpneqb .L.str(%rip), %zmm0, %k0
-; X64-AVX512BW-NEXT:    vpcmpneqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %k1
-; X64-AVX512BW-NEXT:    kortestq %k1, %k0
-; X64-AVX512BW-NEXT:    sete %al
-; X64-AVX512BW-NEXT:    vzeroupper
-; X64-AVX512BW-NEXT:    retq
-;
-; X64-AVX512F-LABEL: length96_eq_const:
-; X64-AVX512F:       # %bb.0:
-; X64-AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-AVX512F-NEXT:    vmovdqu 64(%rdi), %ymm1
-; X64-AVX512F-NEXT:    vpcmpneqd .L.str(%rip), %zmm0, %k0
-; X64-AVX512F-NEXT:    vpcmpneqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %k1
-; X64-AVX512F-NEXT:    kortestw %k1, %k0
-; X64-AVX512F-NEXT:    sete %al
-; X64-AVX512F-NEXT:    vzeroupper
-; X64-AVX512F-NEXT:    retq
-;
-; X64-MIC-AVX2-LABEL: length96_eq_const:
-; X64-MIC-AVX2:       # %bb.0:
-; X64-MIC-AVX2-NEXT:    pushq %rax
-; X64-MIC-AVX2-NEXT:    movl $.L.str, %esi
-; X64-MIC-AVX2-NEXT:    movl $96, %edx
-; X64-MIC-AVX2-NEXT:    callq memcmp
-; X64-MIC-AVX2-NEXT:    testl %eax, %eax
-; X64-MIC-AVX2-NEXT:    sete %al
-; X64-MIC-AVX2-NEXT:    popq %rcx
-; X64-MIC-AVX2-NEXT:    retq
-;
-; X64-MIC-AVX512F-LABEL: length96_eq_const:
-; X64-MIC-AVX512F:       # %bb.0:
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-MIC-AVX512F-NEXT:    vmovdqu 64(%rdi), %ymm1
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd .L.str(%rip), %zmm0, %k0
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %k1
-; X64-MIC-AVX512F-NEXT:    kortestw %k1, %k0
-; X64-MIC-AVX512F-NEXT:    sete %al
-; X64-MIC-AVX512F-NEXT:    vzeroupper
-; X64-MIC-AVX512F-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 96) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length127(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length127:
-; X64:       # %bb.0:
-; X64-NEXT:    movl $127, %edx
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 127) nounwind
-  ret i32 %m
-}
-
-define i1 @length127_eq(ptr %x, ptr %y) nounwind {
-; X64-SSE-LABEL: length127_eq:
-; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    pushq %rax
-; X64-SSE-NEXT:    movl $127, %edx
-; X64-SSE-NEXT:    callq memcmp
-; X64-SSE-NEXT:    testl %eax, %eax
-; X64-SSE-NEXT:    setne %al
-; X64-SSE-NEXT:    popq %rcx
-; X64-SSE-NEXT:    retq
-;
-; X64-AVX1-LABEL: length127_eq:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    pushq %rax
-; X64-AVX1-NEXT:    movl $127, %edx
-; X64-AVX1-NEXT:    callq memcmp
-; X64-AVX1-NEXT:    testl %eax, %eax
-; X64-AVX1-NEXT:    setne %al
-; X64-AVX1-NEXT:    popq %rcx
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length127_eq:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    pushq %rax
-; X64-AVX2-NEXT:    movl $127, %edx
-; X64-AVX2-NEXT:    callq memcmp
-; X64-AVX2-NEXT:    testl %eax, %eax
-; X64-AVX2-NEXT:    setne %al
-; X64-AVX2-NEXT:    popq %rcx
-; X64-AVX2-NEXT:    retq
-;
-; X64-AVX512BW-LABEL: length127_eq:
-; X64-AVX512BW:       # %bb.0:
-; X64-AVX512BW-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-AVX512BW-NEXT:    vmovdqu64 63(%rdi), %zmm1
-; X64-AVX512BW-NEXT:    vpcmpneqb 63(%rsi), %zmm1, %k0
-; X64-AVX512BW-NEXT:    vpcmpneqb (%rsi), %zmm0, %k1
-; X64-AVX512BW-NEXT:    kortestq %k0, %k1
-; X64-AVX512BW-NEXT:    setne %al
-; X64-AVX512BW-NEXT:    vzeroupper
-; X64-AVX512BW-NEXT:    retq
-;
-; X64-AVX512F-LABEL: length127_eq:
-; X64-AVX512F:       # %bb.0:
-; X64-AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-AVX512F-NEXT:    vmovdqu64 63(%rdi), %zmm1
-; X64-AVX512F-NEXT:    vpcmpneqd 63(%rsi), %zmm1, %k0
-; X64-AVX512F-NEXT:    vpcmpneqd (%rsi), %zmm0, %k1
-; X64-AVX512F-NEXT:    kortestw %k0, %k1
-; X64-AVX512F-NEXT:    setne %al
-; X64-AVX512F-NEXT:    vzeroupper
-; X64-AVX512F-NEXT:    retq
-;
-; X64-MIC-AVX2-LABEL: length127_eq:
-; X64-MIC-AVX2:       # %bb.0:
-; X64-MIC-AVX2-NEXT:    pushq %rax
-; X64-MIC-AVX2-NEXT:    movl $127, %edx
-; X64-MIC-AVX2-NEXT:    callq memcmp
-; X64-MIC-AVX2-NEXT:    testl %eax, %eax
-; X64-MIC-AVX2-NEXT:    setne %al
-; X64-MIC-AVX2-NEXT:    popq %rcx
-; X64-MIC-AVX2-NEXT:    retq
-;
-; X64-MIC-AVX512F-LABEL: length127_eq:
-; X64-MIC-AVX512F:       # %bb.0:
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 63(%rdi), %zmm1
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd 63(%rsi), %zmm1, %k0
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd (%rsi), %zmm0, %k1
-; X64-MIC-AVX512F-NEXT:    kortestw %k0, %k1
-; X64-MIC-AVX512F-NEXT:    setne %al
-; X64-MIC-AVX512F-NEXT:    vzeroupper
-; X64-MIC-AVX512F-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 127) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length127_lt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length127_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $127, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    shrl $31, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 127) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length127_gt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length127_gt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $127, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setg %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 127) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length127_eq_const(ptr %X) nounwind {
-; X64-SSE-LABEL: length127_eq_const:
-; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    pushq %rax
-; X64-SSE-NEXT:    movl $.L.str, %esi
-; X64-SSE-NEXT:    movl $127, %edx
-; X64-SSE-NEXT:    callq memcmp
-; X64-SSE-NEXT:    testl %eax, %eax
-; X64-SSE-NEXT:    sete %al
-; X64-SSE-NEXT:    popq %rcx
-; X64-SSE-NEXT:    retq
-;
-; X64-AVX1-LABEL: length127_eq_const:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    pushq %rax
-; X64-AVX1-NEXT:    movl $.L.str, %esi
-; X64-AVX1-NEXT:    movl $127, %edx
-; X64-AVX1-NEXT:    callq memcmp
-; X64-AVX1-NEXT:    testl %eax, %eax
-; X64-AVX1-NEXT:    sete %al
-; X64-AVX1-NEXT:    popq %rcx
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length127_eq_const:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    pushq %rax
-; X64-AVX2-NEXT:    movl $.L.str, %esi
-; X64-AVX2-NEXT:    movl $127, %edx
-; X64-AVX2-NEXT:    callq memcmp
-; X64-AVX2-NEXT:    testl %eax, %eax
-; X64-AVX2-NEXT:    sete %al
-; X64-AVX2-NEXT:    popq %rcx
-; X64-AVX2-NEXT:    retq
-;
-; X64-AVX512BW-LABEL: length127_eq_const:
-; X64-AVX512BW:       # %bb.0:
-; X64-AVX512BW-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-AVX512BW-NEXT:    vmovdqu64 63(%rdi), %zmm1
-; X64-AVX512BW-NEXT:    vpcmpneqb .L.str+63(%rip), %zmm1, %k0
-; X64-AVX512BW-NEXT:    vpcmpneqb .L.str(%rip), %zmm0, %k1
-; X64-AVX512BW-NEXT:    kortestq %k0, %k1
-; X64-AVX512BW-NEXT:    sete %al
-; X64-AVX512BW-NEXT:    vzeroupper
-; X64-AVX512BW-NEXT:    retq
-;
-; X64-AVX512F-LABEL: length127_eq_const:
-; X64-AVX512F:       # %bb.0:
-; X64-AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-AVX512F-NEXT:    vmovdqu64 63(%rdi), %zmm1
-; X64-AVX512F-NEXT:    vpcmpneqd .L.str+63(%rip), %zmm1, %k0
-; X64-AVX512F-NEXT:    vpcmpneqd .L.str(%rip), %zmm0, %k1
-; X64-AVX512F-NEXT:    kortestw %k0, %k1
-; X64-AVX512F-NEXT:    sete %al
-; X64-AVX512F-NEXT:    vzeroupper
-; X64-AVX512F-NEXT:    retq
-;
-; X64-MIC-AVX2-LABEL: length127_eq_const:
-; X64-MIC-AVX2:       # %bb.0:
-; X64-MIC-AVX2-NEXT:    pushq %rax
-; X64-MIC-AVX2-NEXT:    movl $.L.str, %esi
-; X64-MIC-AVX2-NEXT:    movl $127, %edx
-; X64-MIC-AVX2-NEXT:    callq memcmp
-; X64-MIC-AVX2-NEXT:    testl %eax, %eax
-; X64-MIC-AVX2-NEXT:    sete %al
-; X64-MIC-AVX2-NEXT:    popq %rcx
-; X64-MIC-AVX2-NEXT:    retq
-;
-; X64-MIC-AVX512F-LABEL: length127_eq_const:
-; X64-MIC-AVX512F:       # %bb.0:
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 63(%rdi), %zmm1
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd .L.str+63(%rip), %zmm1, %k0
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd .L.str(%rip), %zmm0, %k1
-; X64-MIC-AVX512F-NEXT:    kortestw %k0, %k1
-; X64-MIC-AVX512F-NEXT:    sete %al
-; X64-MIC-AVX512F-NEXT:    vzeroupper
-; X64-MIC-AVX512F-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 127) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length128(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length128:
-; X64:       # %bb.0:
-; X64-NEXT:    movl $128, %edx
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 128) nounwind
-  ret i32 %m
-}
-
-define i1 @length128_eq(ptr %x, ptr %y) nounwind {
-; X64-SSE-LABEL: length128_eq:
-; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    pushq %rax
-; X64-SSE-NEXT:    movl $128, %edx
-; X64-SSE-NEXT:    callq memcmp
-; X64-SSE-NEXT:    testl %eax, %eax
-; X64-SSE-NEXT:    setne %al
-; X64-SSE-NEXT:    popq %rcx
-; X64-SSE-NEXT:    retq
-;
-; X64-AVX1-LABEL: length128_eq:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    pushq %rax
-; X64-AVX1-NEXT:    movl $128, %edx
-; X64-AVX1-NEXT:    callq memcmp
-; X64-AVX1-NEXT:    testl %eax, %eax
-; X64-AVX1-NEXT:    setne %al
-; X64-AVX1-NEXT:    popq %rcx
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length128_eq:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    pushq %rax
-; X64-AVX2-NEXT:    movl $128, %edx
-; X64-AVX2-NEXT:    callq memcmp
-; X64-AVX2-NEXT:    testl %eax, %eax
-; X64-AVX2-NEXT:    setne %al
-; X64-AVX2-NEXT:    popq %rcx
-; X64-AVX2-NEXT:    retq
-;
-; X64-AVX512BW-LABEL: length128_eq:
-; X64-AVX512BW:       # %bb.0:
-; X64-AVX512BW-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-AVX512BW-NEXT:    vmovdqu64 64(%rdi), %zmm1
-; X64-AVX512BW-NEXT:    vpcmpneqb 64(%rsi), %zmm1, %k0
-; X64-AVX512BW-NEXT:    vpcmpneqb (%rsi), %zmm0, %k1
-; X64-AVX512BW-NEXT:    kortestq %k0, %k1
-; X64-AVX512BW-NEXT:    setne %al
-; X64-AVX512BW-NEXT:    vzeroupper
-; X64-AVX512BW-NEXT:    retq
-;
-; X64-AVX512F-LABEL: length128_eq:
-; X64-AVX512F:       # %bb.0:
-; X64-AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-AVX512F-NEXT:    vmovdqu64 64(%rdi), %zmm1
-; X64-AVX512F-NEXT:    vpcmpneqd 64(%rsi), %zmm1, %k0
-; X64-AVX512F-NEXT:    vpcmpneqd (%rsi), %zmm0, %k1
-; X64-AVX512F-NEXT:    kortestw %k0, %k1
-; X64-AVX512F-NEXT:    setne %al
-; X64-AVX512F-NEXT:    vzeroupper
-; X64-AVX512F-NEXT:    retq
-;
-; X64-MIC-AVX2-LABEL: length128_eq:
-; X64-MIC-AVX2:       # %bb.0:
-; X64-MIC-AVX2-NEXT:    pushq %rax
-; X64-MIC-AVX2-NEXT:    movl $128, %edx
-; X64-MIC-AVX2-NEXT:    callq memcmp
-; X64-MIC-AVX2-NEXT:    testl %eax, %eax
-; X64-MIC-AVX2-NEXT:    setne %al
-; X64-MIC-AVX2-NEXT:    popq %rcx
-; X64-MIC-AVX2-NEXT:    retq
-;
-; X64-MIC-AVX512F-LABEL: length128_eq:
-; X64-MIC-AVX512F:       # %bb.0:
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 64(%rdi), %zmm1
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd 64(%rsi), %zmm1, %k0
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd (%rsi), %zmm0, %k1
-; X64-MIC-AVX512F-NEXT:    kortestw %k0, %k1
-; X64-MIC-AVX512F-NEXT:    setne %al
-; X64-MIC-AVX512F-NEXT:    vzeroupper
-; X64-MIC-AVX512F-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 128) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length128_lt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length128_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $128, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    shrl $31, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 128) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length128_gt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length128_gt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $128, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setg %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 128) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length128_eq_const(ptr %X) nounwind {
-; X64-SSE-LABEL: length128_eq_const:
-; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    pushq %rax
-; X64-SSE-NEXT:    movl $.L.str, %esi
-; X64-SSE-NEXT:    movl $128, %edx
-; X64-SSE-NEXT:    callq memcmp
-; X64-SSE-NEXT:    testl %eax, %eax
-; X64-SSE-NEXT:    sete %al
-; X64-SSE-NEXT:    popq %rcx
-; X64-SSE-NEXT:    retq
-;
-; X64-AVX1-LABEL: length128_eq_const:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    pushq %rax
-; X64-AVX1-NEXT:    movl $.L.str, %esi
-; X64-AVX1-NEXT:    movl $128, %edx
-; X64-AVX1-NEXT:    callq memcmp
-; X64-AVX1-NEXT:    testl %eax, %eax
-; X64-AVX1-NEXT:    sete %al
-; X64-AVX1-NEXT:    popq %rcx
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: length128_eq_const:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    pushq %rax
-; X64-AVX2-NEXT:    movl $.L.str, %esi
-; X64-AVX2-NEXT:    movl $128, %edx
-; X64-AVX2-NEXT:    callq memcmp
-; X64-AVX2-NEXT:    testl %eax, %eax
-; X64-AVX2-NEXT:    sete %al
-; X64-AVX2-NEXT:    popq %rcx
-; X64-AVX2-NEXT:    retq
-;
-; X64-AVX512BW-LABEL: length128_eq_const:
-; X64-AVX512BW:       # %bb.0:
-; X64-AVX512BW-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-AVX512BW-NEXT:    vmovdqu64 64(%rdi), %zmm1
-; X64-AVX512BW-NEXT:    vpcmpneqb .L.str+64(%rip), %zmm1, %k0
-; X64-AVX512BW-NEXT:    vpcmpneqb .L.str(%rip), %zmm0, %k1
-; X64-AVX512BW-NEXT:    kortestq %k0, %k1
-; X64-AVX512BW-NEXT:    sete %al
-; X64-AVX512BW-NEXT:    vzeroupper
-; X64-AVX512BW-NEXT:    retq
-;
-; X64-AVX512F-LABEL: length128_eq_const:
-; X64-AVX512F:       # %bb.0:
-; X64-AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-AVX512F-NEXT:    vmovdqu64 64(%rdi), %zmm1
-; X64-AVX512F-NEXT:    vpcmpneqd .L.str+64(%rip), %zmm1, %k0
-; X64-AVX512F-NEXT:    vpcmpneqd .L.str(%rip), %zmm0, %k1
-; X64-AVX512F-NEXT:    kortestw %k0, %k1
-; X64-AVX512F-NEXT:    sete %al
-; X64-AVX512F-NEXT:    vzeroupper
-; X64-AVX512F-NEXT:    retq
-;
-; X64-MIC-AVX2-LABEL: length128_eq_const:
-; X64-MIC-AVX2:       # %bb.0:
-; X64-MIC-AVX2-NEXT:    pushq %rax
-; X64-MIC-AVX2-NEXT:    movl $.L.str, %esi
-; X64-MIC-AVX2-NEXT:    movl $128, %edx
-; X64-MIC-AVX2-NEXT:    callq memcmp
-; X64-MIC-AVX2-NEXT:    testl %eax, %eax
-; X64-MIC-AVX2-NEXT:    sete %al
-; X64-MIC-AVX2-NEXT:    popq %rcx
-; X64-MIC-AVX2-NEXT:    retq
-;
-; X64-MIC-AVX512F-LABEL: length128_eq_const:
-; X64-MIC-AVX512F:       # %bb.0:
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
-; X64-MIC-AVX512F-NEXT:    vmovdqu64 64(%rdi), %zmm1
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd .L.str+64(%rip), %zmm1, %k0
-; X64-MIC-AVX512F-NEXT:    vpcmpneqd .L.str(%rip), %zmm0, %k1
-; X64-MIC-AVX512F-NEXT:    kortestw %k0, %k1
-; X64-MIC-AVX512F-NEXT:    sete %al
-; X64-MIC-AVX512F-NEXT:    vzeroupper
-; X64-MIC-AVX512F-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 128) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length192(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length192:
-; X64:       # %bb.0:
-; X64-NEXT:    movl $192, %edx
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 192) nounwind
-  ret i32 %m
-}
-
-define i1 @length192_eq(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length192_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $192, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setne %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 192) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length192_lt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length192_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $192, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    shrl $31, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 192) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length192_gt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length192_gt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $192, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setg %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 192) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length192_eq_const(ptr %X) nounwind {
-; X64-LABEL: length192_eq_const:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $.L.str, %esi
-; X64-NEXT:    movl $192, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    sete %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 192) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length255(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length255:
-; X64:       # %bb.0:
-; X64-NEXT:    movl $255, %edx
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 255) nounwind
-  ret i32 %m
-}
-
-define i1 @length255_eq(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length255_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $255, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setne %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 255) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length255_lt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length255_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $255, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    shrl $31, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 255) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length255_gt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length255_gt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $255, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setg %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 255) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length255_eq_const(ptr %X) nounwind {
-; X64-LABEL: length255_eq_const:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $.L.str, %esi
-; X64-NEXT:    movl $255, %edx
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    sete %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 255) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length256(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length256:
-; X64:       # %bb.0:
-; X64-NEXT:    movl $256, %edx # imm = 0x100
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 256) nounwind
-  ret i32 %m
-}
-
-define i1 @length256_eq(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length256_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $256, %edx # imm = 0x100
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setne %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 256) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length256_lt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length256_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $256, %edx # imm = 0x100
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    shrl $31, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 256) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length256_gt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length256_gt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $256, %edx # imm = 0x100
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setg %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 256) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length256_eq_const(ptr %X) nounwind {
-; X64-LABEL: length256_eq_const:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $.L.str, %esi
-; X64-NEXT:    movl $256, %edx # imm = 0x100
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    sete %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 256) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length384(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length384:
-; X64:       # %bb.0:
-; X64-NEXT:    movl $384, %edx # imm = 0x180
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 384) nounwind
-  ret i32 %m
-}
-
-define i1 @length384_eq(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length384_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $384, %edx # imm = 0x180
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setne %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 384) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length384_lt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length384_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $384, %edx # imm = 0x180
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    shrl $31, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 384) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length384_gt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length384_gt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $384, %edx # imm = 0x180
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setg %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 384) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length384_eq_const(ptr %X) nounwind {
-; X64-LABEL: length384_eq_const:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $.L.str, %esi
-; X64-NEXT:    movl $384, %edx # imm = 0x180
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    sete %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 384) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length511(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length511:
-; X64:       # %bb.0:
-; X64-NEXT:    movl $511, %edx # imm = 0x1FF
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 511) nounwind
-  ret i32 %m
-}
-
-define i1 @length511_eq(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length511_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $511, %edx # imm = 0x1FF
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setne %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 511) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length511_lt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length511_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $511, %edx # imm = 0x1FF
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    shrl $31, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 511) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length511_gt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length511_gt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $511, %edx # imm = 0x1FF
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setg %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 511) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length511_eq_const(ptr %X) nounwind {
-; X64-LABEL: length511_eq_const:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $.L.str, %esi
-; X64-NEXT:    movl $511, %edx # imm = 0x1FF
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    sete %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 511) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-define i32 @length512(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: length512:
-; X64:       # %bb.0:
-; X64-NEXT:    movl $512, %edx # imm = 0x200
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 512) nounwind
-  ret i32 %m
-}
-
-define i1 @length512_eq(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length512_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $512, %edx # imm = 0x200
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setne %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 512) nounwind
-  %cmp = icmp ne i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length512_lt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length512_lt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $512, %edx # imm = 0x200
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    shrl $31, %eax
-; X64-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 512) nounwind
-  %cmp = icmp slt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length512_gt(ptr %x, ptr %y) nounwind {
-; X64-LABEL: length512_gt:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $512, %edx # imm = 0x200
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    setg %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 512) nounwind
-  %cmp = icmp sgt i32 %call, 0
-  ret i1 %cmp
-}
-
-define i1 @length512_eq_const(ptr %X) nounwind {
-; X64-LABEL: length512_eq_const:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movl $.L.str, %esi
-; X64-NEXT:    movl $512, %edx # imm = 0x200
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    sete %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 512) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-; This checks that we do not do stupid things with huge sizes.
-define i32 @huge_length(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: huge_length:
-; X64:       # %bb.0:
-; X64-NEXT:    movabsq $9223372036854775807, %rdx # imm = 0x7FFFFFFFFFFFFFFF
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 9223372036854775807) nounwind
-  ret i32 %m
-}
-
-define i1 @huge_length_eq(ptr %X, ptr %Y) nounwind {
-; X64-LABEL: huge_length_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movabsq $9223372036854775807, %rdx # imm = 0x7FFFFFFFFFFFFFFF
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    sete %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 9223372036854775807) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
-
-; This checks non-constant sizes.
-define i32 @nonconst_length(ptr %X, ptr %Y, i64 %size) nounwind {
-; X64-LABEL: nonconst_length:
-; X64:       # %bb.0:
-; X64-NEXT:    jmp memcmp # TAILCALL
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 %size) nounwind
-  ret i32 %m
-}
-
-define i1 @nonconst_length_eq(ptr %X, ptr %Y, i64 %size) nounwind {
-; X64-LABEL: nonconst_length_eq:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    callq memcmp
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    sete %al
-; X64-NEXT:    popq %rcx
-; X64-NEXT:    retq
-  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 %size) nounwind
-  %c = icmp eq i32 %m, 0
-  ret i1 %c
-}
diff --git a/llvm/test/CodeGen/X86/opt-pipeline.ll b/llvm/test/CodeGen/X86/opt-pipeline.ll
index fb8d2335b34106..e9eddf35f7403c 100644
--- a/llvm/test/CodeGen/X86/opt-pipeline.ll
+++ b/llvm/test/CodeGen/X86/opt-pipeline.ll
@@ -19,8 +19,8 @@
 ; CHECK-NEXT: Type-Based Alias Analysis
 ; CHECK-NEXT: Scoped NoAlias Alias Analysis
 ; CHECK-NEXT: Assumption Cache Tracker
-; CHECK-NEXT: Profile summary info
 ; CHECK-NEXT: Create Garbage Collector Module Metadata
+; CHECK-NEXT: Profile summary info
 ; CHECK-NEXT: Machine Branch Probability Analysis
 ; CHECK-NEXT: Default Regalloc Eviction Advisor
 ; CHECK-NEXT: Default Regalloc Priority Advisor
@@ -42,13 +42,6 @@
 ; CHECK-NEXT:         Canonicalize Freeze Instructions in Loops
 ; CHECK-NEXT:         Induction Variable Users
 ; CHECK-NEXT:         Loop Strength Reduction
-; CHECK-NEXT:       Basic Alias Analysis (stateless AA impl)
-; CHECK-NEXT:         Function Alias Analysis Results
-; CHECK-NEXT:       Merge contiguous icmps into a memcmp
-; CHECK-NEXT:       Natural Loop Information
-; CHECK-NEXT:       Lazy Branch Probability Analysis
-; CHECK-NEXT:       Lazy Block Frequency Analysis
-; CHECK-NEXT:       Expand memcmp() to load/stores
 ; CHECK-NEXT:       Lower Garbage Collection Instructions
 ; CHECK-NEXT:       Shadow Stack GC Lowering
 ; CHECK-NEXT:       Lower constant intrinsics
diff --git a/llvm/test/Other/new-pm-defaults.ll b/llvm/test/Other/new-pm-defaults.ll
index ecdb5a5e010d92..ce13b2eb52a7ef 100644
--- a/llvm/test/Other/new-pm-defaults.ll
+++ b/llvm/test/Other/new-pm-defaults.ll
@@ -142,10 +142,12 @@
 ; CHECK-O2-NEXT: Running pass: OpenMPOptCGSCCPass on (foo)
 ; CHECK-O3-NEXT: Running pass: OpenMPOptCGSCCPass on (foo)
 ; CHECK-EP-CGSCC-LATE-NEXT: Running pass: NoOpCGSCCPass
+; CHECK-O-NEXT: Running pass: MergeICmpsPass on foo
+; CHECK-O-NEXT: Running analysis: AAManager on foo
+; CHECK-O-NEXT: Running pass: ExpandMemCmpPass on foo
 ; CHECK-O-NEXT: Running pass: SROAPass
 ; CHECK-O-NEXT: Running pass: EarlyCSEPass
 ; CHECK-O-NEXT: Running analysis: MemorySSAAnalysis
-; CHECK-O-NEXT: Running analysis: AAManager
 ; CHECK-O23SZ-NEXT: Running pass: SpeculativeExecutionPass
 ; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass
 ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis
diff --git a/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll
index 064362eabbf839..d6f09a85953c14 100644
--- a/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll
@@ -81,10 +81,12 @@
 ; CHECK-O3-NEXT: Running pass: ArgumentPromotionPass
 ; CHECK-O2-NEXT: Running pass: OpenMPOptCGSCCPass on (foo)
 ; CHECK-O3-NEXT: Running pass: OpenMPOptCGSCCPass on (foo)
+; CHECK-O-NEXT: Running pass: MergeICmpsPass on foo
+; CHECK-O-NEXT: Running analysis: AAManager on foo
+; CHECK-O-NEXT: Running pass: ExpandMemCmpPass on foo
 ; CHECK-O-NEXT: Running pass: SROAPass
 ; CHECK-O-NEXT: Running pass: EarlyCSEPass
 ; CHECK-O-NEXT: Running analysis: MemorySSAAnalysis
-; CHECK-O-NEXT: Running analysis: AAManager
 ; CHECK-O23SZ-NEXT: Running pass: SpeculativeExecutionPass
 ; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass
 ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis
diff --git a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
index 19a44867e434ac..cc3939c5bdcf7b 100644
--- a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
@@ -69,10 +69,12 @@
 ; CHECK-O3-NEXT: Running pass: ArgumentPromotionPass
 ; CHECK-O2-NEXT: Running pass: OpenMPOptCGSCCPass
 ; CHECK-O3-NEXT: Running pass: OpenMPOptCGSCCPass
+; CHECK-O-NEXT: Running pass: MergeICmpsPass on foo
+; CHECK-O-NEXT: Running analysis: AAManager on foo
+; CHECK-O-NEXT: Running pass: ExpandMemCmpPass on foo
 ; CHECK-O-NEXT: Running pass: SROAPass
 ; CHECK-O-NEXT: Running pass: EarlyCSEPass
 ; CHECK-O-NEXT: Running analysis: MemorySSAAnalysis
-; CHECK-O-NEXT: Running analysis: AAManager
 ; CHECK-O23SZ-NEXT: Running pass: SpeculativeExecutionPass
 ; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass
 ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis
diff --git a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
index ac80a31d8fd4bc..bf354c91d15f37 100644
--- a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
@@ -77,10 +77,12 @@
 ; CHECK-O3-NEXT: Running pass: ArgumentPromotionPass
 ; CHECK-O2-NEXT: Running pass: OpenMPOptCGSCCPass
 ; CHECK-O3-NEXT: Running pass: OpenMPOptCGSCCPass
+; CHECK-O-NEXT: Running pass: MergeICmpsPass on foo
+; CHECK-O-NEXT: Running analysis: AAManager on foo
+; CHECK-O-NEXT: Running pass: ExpandMemCmpPass on foo
 ; CHECK-O-NEXT: Running pass: SROAPass
 ; CHECK-O-NEXT: Running pass: EarlyCSEPass
 ; CHECK-O-NEXT: Running analysis: MemorySSAAnalysis
-; CHECK-O-NEXT: Running analysis: AAManager
 ; CHECK-O23SZ-NEXT: Running pass: SpeculativeExecutionPass
 ; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass
 ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis
diff --git a/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll
index 6486639e07b49c..9c5f9fd281ee7c 100644
--- a/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll
@@ -112,10 +112,12 @@
 ; CHECK-O3-NEXT: Running pass: ArgumentPromotionPass
 ; CHECK-O2-NEXT: Running pass: OpenMPOptCGSCCPass on (foo)
 ; CHECK-O3-NEXT: Running pass: OpenMPOptCGSCCPass on (foo)
+; CHECK-O-NEXT: Running pass: MergeICmpsPass on foo
+; CHECK-O-NEXT: Running analysis: AAManager on foo
+; CHECK-O-NEXT: Running pass: ExpandMemCmpPass on foo
 ; CHECK-O-NEXT: Running pass: SROAPass
 ; CHECK-O-NEXT: Running pass: EarlyCSEPass
 ; CHECK-O-NEXT: Running analysis: MemorySSAAnalysis
-; CHECK-O-NEXT: Running analysis: AAManager
 ; CHECK-O23SZ-NEXT: Running pass: SpeculativeExecutionPass
 ; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass
 ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis
diff --git a/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll
index 09f9f0f48baddb..92ab5b6bbc74ad 100644
--- a/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll
@@ -102,17 +102,23 @@
 ; CHECK-O3-NEXT: Running pass: ArgumentPromotionPass
 ; CHECK-O2-NEXT: Running pass: OpenMPOptCGSCCPass
 ; CHECK-O3-NEXT: Running pass: OpenMPOptCGSCCPass
+; CHECK-O-NEXT: Running pass: MergeICmpsPass on foo
+; CHECK-O-NEXT: Running analysis: TargetIRAnalysis on foo
+; CHECK-O-NEXT: Running analysis: AAManager on foo
+; CHECK-O-NEXT: Running analysis: BasicAA on foo
+; CHECK-O-NEXT: Running analysis: AssumptionAnalysis on foo
+; CHECK-O-NEXT: Running analysis: DominatorTreeAnalysis on foo
+; CHECK-O-NEXT: Running analysis: ScopedNoAliasAA on foo
+; CHECK-O-NEXT: Running analysis: TypeBasedAA on foo
+; CHECK-O-NEXT: Running analysis: OuterAnalysisManagerProxy
+; CHECK-O-NEXT: Running pass: ExpandMemCmpPass on foo
+; CHECK-O-NEXT: Running analysis: BlockFrequencyAnalysis on foo
+; CHECK-O-NEXT: Running analysis: BranchProbabilityAnalysis on foo
+; CHECK-O-NEXT: Running analysis: LoopAnalysis on foo
+; CHECK-O-NEXT: Running analysis: PostDominatorTreeAnalysis on foo
 ; CHECK-O-NEXT: Running pass: SROAPass
-; CHECK-O-NEXT: Running analysis: DominatorTreeAnalysis
-; CHECK-O-NEXT: Running analysis: AssumptionAnalysis
-; CHECK-O-NEXT: Running analysis: TargetIRAnalysis
 ; CHECK-O-NEXT: Running pass: EarlyCSEPass
 ; CHECK-O-NEXT: Running analysis: MemorySSAAnalysis
-; CHECK-O-NEXT: Running analysis: AAManager
-; CHECK-O-NEXT: Running analysis: BasicAA
-; CHECK-O-NEXT: Running analysis: ScopedNoAliasAA
-; CHECK-O-NEXT: Running analysis: TypeBasedAA
-; CHECK-O-NEXT: Running analysis: OuterAnalysisManagerProxy
 ; CHECK-O23SZ-NEXT: Running pass: SpeculativeExecutionPass
 ; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass
 ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis
@@ -120,10 +126,6 @@
 ; CHECK-O23SZ-NEXT: Invalidating analysis: LazyValueAnalysis
 ; CHECK-O-NEXT: Running pass: SimplifyCFGPass
 ; CHECK-O-NEXT: Running pass: InstCombinePass
-; CHECK-O-NEXT: Running analysis: BlockFrequencyAnalysis on foo
-; CHECK-O-NEXT: Running analysis: BranchProbabilityAnalysis on foo
-; CHECK-O-NEXT: Running analysis: LoopAnalysis on foo
-; CHECK-O-NEXT: Running analysis: PostDominatorTreeAnalysis on foo
 ; CHECK-O23SZ-NEXT: Running pass: AggressiveInstCombinePass
 ; CHECK-O1-NEXT: Running pass: LibCallsShrinkWrapPass
 ; CHECK-O2-NEXT: Running pass: LibCallsShrinkWrapPass
diff --git a/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll
index 47bdbfd2d357d4..b565e80ac05e90 100644
--- a/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll
@@ -81,10 +81,12 @@
 ; CHECK-O3-NEXT: Running pass: ArgumentPromotionPass
 ; CHECK-O2-NEXT: Running pass: OpenMPOptCGSCCPass
 ; CHECK-O3-NEXT: Running pass: OpenMPOptCGSCCPass
+; CHECK-O-NEXT: Running pass: MergeICmpsPass on foo
+; CHECK-O-NEXT: Running analysis: AAManager on foo
+; CHECK-O-NEXT: Running pass: ExpandMemCmpPass on foo
 ; CHECK-O-NEXT: Running pass: SROAPass
 ; CHECK-O-NEXT: Running pass: EarlyCSEPass
 ; CHECK-O-NEXT: Running analysis: MemorySSAAnalysis
-; CHECK-O-NEXT: Running analysis: AAManager
 ; CHECK-O23SZ-NEXT: Running pass: SpeculativeExecutionPass
 ; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass
 ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis
diff --git a/llvm/test/Transforms/ExpandMemCmp/AArch64/bcmp.ll b/llvm/test/Transforms/ExpandMemCmp/AArch64/bcmp.ll
new file mode 100644
index 00000000000000..18141e72007f7a
--- /dev/null
+++ b/llvm/test/Transforms/ExpandMemCmp/AArch64/bcmp.ll
@@ -0,0 +1,751 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt -S -passes=expand-memcmp  < %s -mtriple=aarch64-unknown-unknown | FileCheck %s
+
+declare i32 @bcmp(ptr, ptr, i64)
+
+define i1 @bcmp0(ptr %a, ptr %b) {
+; CHECK-LABEL: define i1 @bcmp0(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-NEXT:    ret i1 true
+;
+  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 0)
+  %r = icmp eq i32 %cr, 0
+  ret i1 %r
+}
+
+define i1 @bcmp1(ptr %a, ptr %b) {
+; CHECK-LABEL: define i1 @bcmp1(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[A]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr [[B]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne i8 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 1)
+  %r = icmp eq i32 %cr, 0
+  ret i1 %r
+}
+
+define i1 @bcmp2(ptr %a, ptr %b) {
+; CHECK-LABEL: define i1 @bcmp2(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[A]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr [[B]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 2)
+  %r = icmp eq i32 %cr, 0
+  ret i1 %r
+}
+
+; or (and (xor a, b), C1), (and (xor c, d), C2)
+define i1 @bcmp3(ptr %a, ptr %b) {
+; CHECK-LABEL: define i1 @bcmp3(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[A]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr [[B]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 2
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 2
+; CHECK-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; CHECK-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; CHECK-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; CHECK-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i32 [[TMP13]], 0
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 3)
+  %r = icmp eq i32 %cr, 0
+  ret i1 %r
+}
+
+define i1 @bcmp4(ptr %a, ptr %b) {
+; CHECK-LABEL: define i1 @bcmp4(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[B]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 4)
+  %r = icmp eq i32 %cr, 0
+  ret i1 %r
+}
+
+; or (xor a, b), (and (xor c, d), C2)
+define i1 @bcmp5(ptr %a, ptr %b) {
+; CHECK-LABEL: define i1 @bcmp5(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[B]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 4
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 4
+; CHECK-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; CHECK-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; CHECK-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; CHECK-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i32 [[TMP13]], 0
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 5)
+  %r = icmp eq i32 %cr, 0
+  ret i1 %r
+}
+
+; or (xor a, b), (and (xor c, d), C2)
+define i1 @bcmp6(ptr %a, ptr %b) {
+; CHECK-LABEL: define i1 @bcmp6(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[B]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 4
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 4
+; CHECK-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i16, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP6]] to i32
+; CHECK-NEXT:    [[TMP9:%.*]] = zext i16 [[TMP7]] to i32
+; CHECK-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; CHECK-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i32 [[TMP13]], 0
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 6)
+  %r = icmp eq i32 %cr, 0
+  ret i1 %r
+}
+
+; or (xor a, b), (xor c, d)
+define i1 @bcmp7(ptr %a, ptr %b) {
+; CHECK-LABEL: define i1 @bcmp7(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[B]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 3
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 3
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i32 [[TMP11]], 0
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 7)
+  %r = icmp eq i32 %cr, 0
+  ret i1 %r
+}
+
+define i1 @bcmp8(ptr %a, ptr %b) {
+; CHECK-LABEL: define i1 @bcmp8(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[A]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[B]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 8)
+  %r = icmp eq i32 %cr, 0
+  ret i1 %r
+}
+
+; or (xor a, b), (and (xor c, d), C2)
+define i1 @bcmp9(ptr %a, ptr %b) {
+; CHECK-LABEL: define i1 @bcmp9(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[A]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[B]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 8
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i64
+; CHECK-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i64
+; CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; CHECK-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i32 [[TMP13]], 0
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 9)
+  %r = icmp eq i32 %cr, 0
+  ret i1 %r
+}
+
+define i1 @bcmp10(ptr %a, ptr %b) {
+; CHECK-LABEL: define i1 @bcmp10(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[A]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[B]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 8
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i16, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP6]] to i64
+; CHECK-NEXT:    [[TMP9:%.*]] = zext i16 [[TMP7]] to i64
+; CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; CHECK-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i32 [[TMP13]], 0
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 10)
+  %r = icmp eq i32 %cr, 0
+  ret i1 %r
+}
+
+define i1 @bcmp11(ptr %a, ptr %b) {
+; CHECK-LABEL: define i1 @bcmp11(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[A]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[B]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 3
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 3
+; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i32 [[TMP11]], 0
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 11)
+  %r = icmp eq i32 %cr, 0
+  ret i1 %r
+}
+
+define i1 @bcmp12(ptr %a, ptr %b) {
+; CHECK-LABEL: define i1 @bcmp12(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[A]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[B]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 8
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
+; CHECK-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP7]] to i64
+; CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; CHECK-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i32 [[TMP13]], 0
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 12)
+  %r = icmp eq i32 %cr, 0
+  ret i1 %r
+}
+
+define i1 @bcmp13(ptr %a, ptr %b) {
+; CHECK-LABEL: define i1 @bcmp13(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[A]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[B]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 5
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 5
+; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i32 [[TMP11]], 0
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 13)
+  %r = icmp eq i32 %cr, 0
+  ret i1 %r
+}
+
+define i1 @bcmp14(ptr %a, ptr %b) {
+; CHECK-LABEL: define i1 @bcmp14(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[A]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[B]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 6
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 6
+; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i32 [[TMP11]], 0
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 14)
+  %r = icmp eq i32 %cr, 0
+  ret i1 %r
+}
+
+define i1 @bcmp15(ptr %a, ptr %b) {
+; CHECK-LABEL: define i1 @bcmp15(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[A]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[B]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 7
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 7
+; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i32 [[TMP11]], 0
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 15)
+  %r = icmp eq i32 %cr, 0
+  ret i1 %r
+}
+
+define i1 @bcmp16(ptr %a, ptr %b) {
+; CHECK-LABEL: define i1 @bcmp16(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[A]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[B]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 8
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i32 [[TMP11]], 0
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 16)
+  %r = icmp eq i32 %cr, 0
+  ret i1 %r
+}
+
+define i1 @bcmp20(ptr %a, ptr %b) {
+; CHECK-LABEL: define i1 @bcmp20(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[A]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[B]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 8
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[A]], i64 16
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[B]], i64 16
+; CHECK-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP10]], align 1
+; CHECK-NEXT:    [[TMP13:%.*]] = zext i32 [[TMP11]] to i64
+; CHECK-NEXT:    [[TMP14:%.*]] = zext i32 [[TMP12]] to i64
+; CHECK-NEXT:    [[TMP15:%.*]] = xor i64 [[TMP13]], [[TMP14]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; CHECK-NEXT:    [[TMP17:%.*]] = or i64 [[TMP16]], [[TMP15]]
+; CHECK-NEXT:    [[TMP18:%.*]] = icmp ne i64 [[TMP17]], 0
+; CHECK-NEXT:    [[TMP19:%.*]] = zext i1 [[TMP18]] to i32
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i32 [[TMP19]], 0
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 20)
+  %r = icmp eq i32 %cr, 0
+  ret i1 %r
+}
+
+define i1 @bcmp24(ptr %a, ptr %b) {
+; CHECK-LABEL: define i1 @bcmp24(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[A]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[B]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 8
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[A]], i64 16
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[B]], i64 16
+; CHECK-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12:%.*]] = load i64, ptr [[TMP10]], align 1
+; CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP11]], [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or i64 [[TMP14]], [[TMP13]]
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp ne i64 [[TMP15]], 0
+; CHECK-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i32 [[TMP17]], 0
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 24)
+  %r = icmp eq i32 %cr, 0
+  ret i1 %r
+}
+
+define i1 @bcmp28(ptr %a, ptr %b) {
+; CHECK-LABEL: define i1 @bcmp28(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[A]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[B]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 8
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[A]], i64 16
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[B]], i64 16
+; CHECK-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12:%.*]] = load i64, ptr [[TMP10]], align 1
+; CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP11]], [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[A]], i64 24
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[B]], i64 24
+; CHECK-NEXT:    [[TMP16:%.*]] = load i32, ptr [[TMP14]], align 1
+; CHECK-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP15]], align 1
+; CHECK-NEXT:    [[TMP18:%.*]] = zext i32 [[TMP16]] to i64
+; CHECK-NEXT:    [[TMP19:%.*]] = zext i32 [[TMP17]] to i64
+; CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP18]], [[TMP19]]
+; CHECK-NEXT:    [[TMP21:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; CHECK-NEXT:    [[TMP22:%.*]] = or i64 [[TMP13]], [[TMP20]]
+; CHECK-NEXT:    [[TMP23:%.*]] = or i64 [[TMP21]], [[TMP22]]
+; CHECK-NEXT:    [[TMP24:%.*]] = icmp ne i64 [[TMP23]], 0
+; CHECK-NEXT:    [[TMP25:%.*]] = zext i1 [[TMP24]] to i32
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i32 [[TMP25]], 0
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 28)
+  %r = icmp eq i32 %cr, 0
+  ret i1 %r
+}
+
+define i1 @bcmp33(ptr %a, ptr %b) {
+; CHECK-LABEL: define i1 @bcmp33(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[A]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[B]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 8
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[A]], i64 16
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[B]], i64 16
+; CHECK-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12:%.*]] = load i64, ptr [[TMP10]], align 1
+; CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP11]], [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[A]], i64 24
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[B]], i64 24
+; CHECK-NEXT:    [[TMP16:%.*]] = load i64, ptr [[TMP14]], align 1
+; CHECK-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; CHECK-NEXT:    [[TMP18:%.*]] = xor i64 [[TMP16]], [[TMP17]]
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr i8, ptr [[A]], i64 32
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr i8, ptr [[B]], i64 32
+; CHECK-NEXT:    [[TMP21:%.*]] = load i8, ptr [[TMP19]], align 1
+; CHECK-NEXT:    [[TMP22:%.*]] = load i8, ptr [[TMP20]], align 1
+; CHECK-NEXT:    [[TMP23:%.*]] = zext i8 [[TMP21]] to i64
+; CHECK-NEXT:    [[TMP24:%.*]] = zext i8 [[TMP22]] to i64
+; CHECK-NEXT:    [[TMP25:%.*]] = xor i64 [[TMP23]], [[TMP24]]
+; CHECK-NEXT:    [[TMP26:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; CHECK-NEXT:    [[TMP27:%.*]] = or i64 [[TMP13]], [[TMP18]]
+; CHECK-NEXT:    [[TMP28:%.*]] = or i64 [[TMP26]], [[TMP27]]
+; CHECK-NEXT:    [[TMP29:%.*]] = or i64 [[TMP28]], [[TMP25]]
+; CHECK-NEXT:    [[TMP30:%.*]] = icmp ne i64 [[TMP29]], 0
+; CHECK-NEXT:    [[TMP31:%.*]] = zext i1 [[TMP30]] to i32
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i32 [[TMP31]], 0
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 33)
+  %r = icmp eq i32 %cr, 0
+  ret i1 %r
+}
+
+define i1 @bcmp38(ptr %a, ptr %b) {
+; CHECK-LABEL: define i1 @bcmp38(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[A]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[B]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 8
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[A]], i64 16
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[B]], i64 16
+; CHECK-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12:%.*]] = load i64, ptr [[TMP10]], align 1
+; CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP11]], [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[A]], i64 24
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[B]], i64 24
+; CHECK-NEXT:    [[TMP16:%.*]] = load i64, ptr [[TMP14]], align 1
+; CHECK-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; CHECK-NEXT:    [[TMP18:%.*]] = xor i64 [[TMP16]], [[TMP17]]
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr i8, ptr [[A]], i64 30
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr i8, ptr [[B]], i64 30
+; CHECK-NEXT:    [[TMP21:%.*]] = load i64, ptr [[TMP19]], align 1
+; CHECK-NEXT:    [[TMP22:%.*]] = load i64, ptr [[TMP20]], align 1
+; CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP21]], [[TMP22]]
+; CHECK-NEXT:    [[TMP24:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; CHECK-NEXT:    [[TMP25:%.*]] = or i64 [[TMP13]], [[TMP18]]
+; CHECK-NEXT:    [[TMP26:%.*]] = or i64 [[TMP24]], [[TMP25]]
+; CHECK-NEXT:    [[TMP27:%.*]] = or i64 [[TMP26]], [[TMP23]]
+; CHECK-NEXT:    [[TMP28:%.*]] = icmp ne i64 [[TMP27]], 0
+; CHECK-NEXT:    [[TMP29:%.*]] = zext i1 [[TMP28]] to i32
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i32 [[TMP29]], 0
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 38)
+  %r = icmp eq i32 %cr, 0
+  ret i1 %r
+}
+
+define i1 @bcmp45(ptr %a, ptr %b) {
+; CHECK-LABEL: define i1 @bcmp45(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[A]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[B]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 8
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[A]], i64 16
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[B]], i64 16
+; CHECK-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12:%.*]] = load i64, ptr [[TMP10]], align 1
+; CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP11]], [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[A]], i64 24
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[B]], i64 24
+; CHECK-NEXT:    [[TMP16:%.*]] = load i64, ptr [[TMP14]], align 1
+; CHECK-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; CHECK-NEXT:    [[TMP18:%.*]] = xor i64 [[TMP16]], [[TMP17]]
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr i8, ptr [[A]], i64 32
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr i8, ptr [[B]], i64 32
+; CHECK-NEXT:    [[TMP21:%.*]] = load i64, ptr [[TMP19]], align 1
+; CHECK-NEXT:    [[TMP22:%.*]] = load i64, ptr [[TMP20]], align 1
+; CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP21]], [[TMP22]]
+; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr i8, ptr [[A]], i64 37
+; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr i8, ptr [[B]], i64 37
+; CHECK-NEXT:    [[TMP26:%.*]] = load i64, ptr [[TMP24]], align 1
+; CHECK-NEXT:    [[TMP27:%.*]] = load i64, ptr [[TMP25]], align 1
+; CHECK-NEXT:    [[TMP28:%.*]] = xor i64 [[TMP26]], [[TMP27]]
+; CHECK-NEXT:    [[TMP29:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; CHECK-NEXT:    [[TMP30:%.*]] = or i64 [[TMP13]], [[TMP18]]
+; CHECK-NEXT:    [[TMP31:%.*]] = or i64 [[TMP23]], [[TMP28]]
+; CHECK-NEXT:    [[TMP32:%.*]] = or i64 [[TMP29]], [[TMP30]]
+; CHECK-NEXT:    [[TMP33:%.*]] = or i64 [[TMP32]], [[TMP31]]
+; CHECK-NEXT:    [[TMP34:%.*]] = icmp ne i64 [[TMP33]], 0
+; CHECK-NEXT:    [[TMP35:%.*]] = zext i1 [[TMP34]] to i32
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i32 [[TMP35]], 0
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 45)
+  %r = icmp eq i32 %cr, 0
+  ret i1 %r
+}
+
+; Although the large cmp chain may not be profitable on high-end CPUs, we
+; believe it is better on most CPUs, so perform the transform now.
+; 8 xor + 7 or + 1 cmp need only 6 cycles on a 4-wide ALU port machine:
+;   2 cycles for xor
+;   3 cycles for or
+;   1 cycle for cmp
+define i1 @bcmp64(ptr %a, ptr %b) {
+; CHECK-LABEL: define i1 @bcmp64(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[A]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[B]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 8
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[A]], i64 16
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[B]], i64 16
+; CHECK-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12:%.*]] = load i64, ptr [[TMP10]], align 1
+; CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP11]], [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[A]], i64 24
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[B]], i64 24
+; CHECK-NEXT:    [[TMP16:%.*]] = load i64, ptr [[TMP14]], align 1
+; CHECK-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; CHECK-NEXT:    [[TMP18:%.*]] = xor i64 [[TMP16]], [[TMP17]]
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr i8, ptr [[A]], i64 32
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr i8, ptr [[B]], i64 32
+; CHECK-NEXT:    [[TMP21:%.*]] = load i64, ptr [[TMP19]], align 1
+; CHECK-NEXT:    [[TMP22:%.*]] = load i64, ptr [[TMP20]], align 1
+; CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP21]], [[TMP22]]
+; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr i8, ptr [[A]], i64 40
+; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr i8, ptr [[B]], i64 40
+; CHECK-NEXT:    [[TMP26:%.*]] = load i64, ptr [[TMP24]], align 1
+; CHECK-NEXT:    [[TMP27:%.*]] = load i64, ptr [[TMP25]], align 1
+; CHECK-NEXT:    [[TMP28:%.*]] = xor i64 [[TMP26]], [[TMP27]]
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr i8, ptr [[A]], i64 48
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr i8, ptr [[B]], i64 48
+; CHECK-NEXT:    [[TMP31:%.*]] = load i64, ptr [[TMP29]], align 1
+; CHECK-NEXT:    [[TMP32:%.*]] = load i64, ptr [[TMP30]], align 1
+; CHECK-NEXT:    [[TMP33:%.*]] = xor i64 [[TMP31]], [[TMP32]]
+; CHECK-NEXT:    [[TMP34:%.*]] = getelementptr i8, ptr [[A]], i64 56
+; CHECK-NEXT:    [[TMP35:%.*]] = getelementptr i8, ptr [[B]], i64 56
+; CHECK-NEXT:    [[TMP36:%.*]] = load i64, ptr [[TMP34]], align 1
+; CHECK-NEXT:    [[TMP37:%.*]] = load i64, ptr [[TMP35]], align 1
+; CHECK-NEXT:    [[TMP38:%.*]] = xor i64 [[TMP36]], [[TMP37]]
+; CHECK-NEXT:    [[TMP39:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; CHECK-NEXT:    [[TMP40:%.*]] = or i64 [[TMP13]], [[TMP18]]
+; CHECK-NEXT:    [[TMP41:%.*]] = or i64 [[TMP23]], [[TMP28]]
+; CHECK-NEXT:    [[TMP42:%.*]] = or i64 [[TMP33]], [[TMP38]]
+; CHECK-NEXT:    [[TMP43:%.*]] = or i64 [[TMP39]], [[TMP40]]
+; CHECK-NEXT:    [[TMP44:%.*]] = or i64 [[TMP41]], [[TMP42]]
+; CHECK-NEXT:    [[TMP45:%.*]] = or i64 [[TMP43]], [[TMP44]]
+; CHECK-NEXT:    [[TMP46:%.*]] = icmp ne i64 [[TMP45]], 0
+; CHECK-NEXT:    [[TMP47:%.*]] = zext i1 [[TMP46]] to i32
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i32 [[TMP47]], 0
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 64)
+  %r = icmp eq i32 %cr, 0
+  ret i1 %r
+}
+
+define i1 @bcmp89(ptr %a, ptr %b) {
+; CHECK-LABEL: define i1 @bcmp89(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) {
+; CHECK-NEXT:    [[CR:%.*]] = call i32 @bcmp(ptr [[A]], ptr [[B]], i64 89)
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i32 [[CR]], 0
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %cr = call i32 @bcmp(ptr %a, ptr %b, i64 89)
+  %r = icmp eq i32 %cr, 0
+  ret i1 %r
+}
+
+define i1 @bcmp_zext(i32 %0, i32 %1, i8 %2, i8 %3) {
+; CHECK-LABEL: define i1 @bcmp_zext(
+; CHECK-SAME: i32 [[TMP0:%.*]], i32 [[TMP1:%.*]], i8 [[TMP2:%.*]], i8 [[TMP3:%.*]]) {
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP1]], [[TMP0]]
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i8 [[TMP3]], [[TMP2]]
+; CHECK-NEXT:    [[TMP7:%.*]] = zext i8 [[TMP6]] to i32
+; CHECK-NEXT:    [[TMP8:%.*]] = or i32 [[TMP5]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 0
+; CHECK-NEXT:    ret i1 [[TMP9]]
+;
+  %5 = xor i32 %1, %0
+  %6 = xor i8 %3, %2
+  %7 = zext i8 %6 to i32
+  %8 = or i32 %5, %7
+  %9 = icmp eq i32 %8, 0
+  ret i1 %9
+}
+
+define i1 @bcmp_i8(i8 %a0, i8 %b0, i8 %a1, i8 %b1, i8 %a2, i8 %b2) {
+; CHECK-LABEL: define i1 @bcmp_i8(
+; CHECK-SAME: i8 [[A0:%.*]], i8 [[B0:%.*]], i8 [[A1:%.*]], i8 [[B1:%.*]], i8 [[A2:%.*]], i8 [[B2:%.*]]) {
+; CHECK-NEXT:    [[XOR0:%.*]] = xor i8 [[B0]], [[A0]]
+; CHECK-NEXT:    [[XOR1:%.*]] = xor i8 [[B1]], [[A1]]
+; CHECK-NEXT:    [[XOR2:%.*]] = xor i8 [[B2]], [[A2]]
+; CHECK-NEXT:    [[OR0:%.*]] = or i8 [[XOR0]], [[XOR1]]
+; CHECK-NEXT:    [[OR1:%.*]] = or i8 [[OR0]], [[XOR2]]
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i8 [[OR1]], 0
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %xor0 = xor i8 %b0, %a0
+  %xor1 = xor i8 %b1, %a1
+  %xor2 = xor i8 %b2, %a2
+  %or0 = or i8 %xor0, %xor1
+  %or1 = or i8 %or0, %xor2
+  %r = icmp eq i8 %or1, 0
+  ret i1 %r
+}
+
+define i1 @bcmp_i16(i16 %a0, i16 %b0, i16 %a1, i16 %b1, i16 %a2, i16 %b2) {
+; CHECK-LABEL: define i1 @bcmp_i16(
+; CHECK-SAME: i16 [[A0:%.*]], i16 [[B0:%.*]], i16 [[A1:%.*]], i16 [[B1:%.*]], i16 [[A2:%.*]], i16 [[B2:%.*]]) {
+; CHECK-NEXT:    [[XOR0:%.*]] = xor i16 [[B0]], [[A0]]
+; CHECK-NEXT:    [[XOR1:%.*]] = xor i16 [[B1]], [[A1]]
+; CHECK-NEXT:    [[XOR2:%.*]] = xor i16 [[B2]], [[A2]]
+; CHECK-NEXT:    [[OR0:%.*]] = or i16 [[XOR0]], [[XOR1]]
+; CHECK-NEXT:    [[OR1:%.*]] = or i16 [[OR0]], [[XOR2]]
+; CHECK-NEXT:    [[R:%.*]] = icmp eq i16 [[OR1]], 0
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %xor0 = xor i16 %b0, %a0
+  %xor1 = xor i16 %b1, %a1
+  %xor2 = xor i16 %b2, %a2
+  %or0 = or i16 %xor0, %xor1
+  %or1 = or i16 %or0, %xor2
+  %r = icmp eq i16 %or1, 0
+  ret i1 %r
+}
+
+define i1 @bcmp_i128(i128 %a0, i128 %b0, i128 %a1, i128 %b1, i128 %a2, i128 %b2) {
+; CHECK-LABEL: define i1 @bcmp_i128(
+; CHECK-SAME: i128 [[A0:%.*]], i128 [[B0:%.*]], i128 [[A1:%.*]], i128 [[B1:%.*]], i128 [[A2:%.*]], i128 [[B2:%.*]]) {
+; CHECK-NEXT:    [[XOR0:%.*]] = xor i128 [[B0]], [[A0]]
+; CHECK-NEXT:    [[XOR1:%.*]] = xor i128 [[B1]], [[A1]]
+; CHECK-NEXT:    [[XOR2:%.*]] = xor i128 [[B2]], [[A2]]
+; CHECK-NEXT:    [[OR0:%.*]] = or i128 [[XOR0]], [[XOR1]]
+; CHECK-NEXT:    [[OR1:%.*]] = or i128 [[OR0]], [[XOR2]]
+; CHECK-NEXT:    [[R:%.*]] = icmp ne i128 [[OR1]], 0
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %xor0 = xor i128 %b0, %a0
+  %xor1 = xor i128 %b1, %a1
+  %xor2 = xor i128 %b2, %a2
+  %or0 = or i128 %xor0, %xor1
+  %or1 = or i128 %or0, %xor2
+  %r = icmp ne i128 %or1, 0
+  ret i1 %r
+}
+
+define i1 @bcmp_i42(i42 %a0, i42 %b0, i42 %a1, i42 %b1, i42 %a2, i42 %b2) {
+; CHECK-LABEL: define i1 @bcmp_i42(
+; CHECK-SAME: i42 [[A0:%.*]], i42 [[B0:%.*]], i42 [[A1:%.*]], i42 [[B1:%.*]], i42 [[A2:%.*]], i42 [[B2:%.*]]) {
+; CHECK-NEXT:    [[XOR0:%.*]] = xor i42 [[B0]], [[A0]]
+; CHECK-NEXT:    [[XOR1:%.*]] = xor i42 [[B1]], [[A1]]
+; CHECK-NEXT:    [[XOR2:%.*]] = xor i42 [[B2]], [[A2]]
+; CHECK-NEXT:    [[OR0:%.*]] = or i42 [[XOR0]], [[XOR1]]
+; CHECK-NEXT:    [[OR1:%.*]] = or i42 [[OR0]], [[XOR2]]
+; CHECK-NEXT:    [[R:%.*]] = icmp ne i42 [[OR1]], 0
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %xor0 = xor i42 %b0, %a0
+  %xor1 = xor i42 %b1, %a1
+  %xor2 = xor i42 %b2, %a2
+  %or0 = or i42 %xor0, %xor1
+  %or1 = or i42 %or0, %xor2
+  %r = icmp ne i42 %or1, 0
+  ret i1 %r
+}
diff --git a/llvm/test/Transforms/ExpandMemCmp/AArch64/memcmp-extra.ll b/llvm/test/Transforms/ExpandMemCmp/AArch64/memcmp-extra.ll
new file mode 100644
index 00000000000000..e9573816c97880
--- /dev/null
+++ b/llvm/test/Transforms/ExpandMemCmp/AArch64/memcmp-extra.ll
@@ -0,0 +1,3434 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt -S -passes=expand-memcmp  < %s -mtriple=aarch64-unknown-unknown | FileCheck %s
+
+ at .str = private constant [513 x i8] c"01234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901\00", align 1
+
+declare dso_local i32 @memcmp(ptr, ptr, i64)
+
+define i32 @length0(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i32 @length0(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    ret i32 0
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 0) nounwind
+  ret i32 %m
+  }
+
+define i1 @length0_eq(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i1 @length0_eq(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    ret i1 true
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 0) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length0_lt(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i1 @length0_lt(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    ret i1 false
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 0) nounwind
+  %c = icmp slt i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length2(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i32 @length2(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; CHECK-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; CHECK-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; CHECK-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; CHECK-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    ret i32 [[TMP7]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
+  ret i32 %m
+}
+
+define i32 @length2_const(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i32 @length2_const(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP2]] to i32
+; CHECK-NEXT:    [[TMP4:%.*]] = sub i32 [[TMP3]], 12594
+; CHECK-NEXT:    ret i32 [[TMP4]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i64 2) nounwind
+  ret i32 %m
+}
+
+define i1 @length2_gt_const(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i1 @length2_gt_const(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP2]] to i32
+; CHECK-NEXT:    [[TMP4:%.*]] = sub i32 [[TMP3]], 12594
+; CHECK-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP4]], 0
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i64 2) nounwind
+  %c = icmp sgt i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length2_eq(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i1 @length2_eq(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length2_lt(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i1 @length2_lt(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; CHECK-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; CHECK-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; CHECK-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; CHECK-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[C:%.*]] = icmp slt i32 [[TMP7]], 0
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
+  %c = icmp slt i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length2_gt(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i1 @length2_gt(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; CHECK-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; CHECK-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; CHECK-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; CHECK-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP7]], 0
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
+  %c = icmp sgt i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length2_eq_const(ptr %X) nounwind {
+; CHECK-LABEL: define i1 @length2_eq_const(
+; CHECK-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ne i16 [[TMP1]], 12849
+; CHECK-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; CHECK-NEXT:    ret i1 [[TMP2]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i64 2) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length2_eq_nobuiltin_attr(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 2) #[[ATTR3:[0-9]+]]
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind nobuiltin
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length3(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i32 @length3(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i24, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i24, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = zext i24 [[TMP1]] to i32
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i24 [[TMP2]] to i32
+; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp ugt i32 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ult i32 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP7]] to i32
+; CHECK-NEXT:    [[TMP10:%.*]] = zext i1 [[TMP8]] to i32
+; CHECK-NEXT:    [[TMP11:%.*]] = sub i32 [[TMP9]], [[TMP10]]
+; CHECK-NEXT:    ret i32 [[TMP11]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 3) nounwind
+  ret i32 %m
+}
+
+define i1 @length3_eq(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i1 @length3_eq(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; CHECK-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; CHECK-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; CHECK-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; CHECK-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; CHECK-NEXT:    ret i1 [[TMP12]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 3) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length4(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i32 @length4(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; CHECK-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; CHECK-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    ret i32 [[TMP9]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
+  ret i32 %m
+}
+
+define i1 @length4_eq(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i1 @length4_eq(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; CHECK-NEXT:    ret i1 [[TMP3]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length4_lt(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i1 @length4_lt(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    ret i1 [[TMP5]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
+  %c = icmp slt i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length4_lt_32(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i32 @length4_lt_32(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = zext i1 [[TMP5]] to i32
+; CHECK-NEXT:    ret i32 [[TMP6]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
+  %c = lshr i32 %m, 31
+  ret i32 %c
+}
+
+define i1 @length4_gt(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i1 @length4_gt(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    ret i1 [[TMP5]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
+  %c = icmp sgt i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length4_eq_const(ptr %X) nounwind {
+; CHECK-LABEL: define i1 @length4_eq_const(
+; CHECK-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 875770417
+; CHECK-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i64 4) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length5(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i32 @length5(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i40, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i40, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = zext i40 [[TMP1]] to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i40 [[TMP2]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp ugt i64 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ult i64 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP7]] to i32
+; CHECK-NEXT:    [[TMP10:%.*]] = zext i1 [[TMP8]] to i32
+; CHECK-NEXT:    [[TMP11:%.*]] = sub i32 [[TMP9]], [[TMP10]]
+; CHECK-NEXT:    ret i32 [[TMP11]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) nounwind
+  ret i32 %m
+}
+
+define i1 @length5_eq(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i1 @length5_eq(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; CHECK-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; CHECK-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; CHECK-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; CHECK-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; CHECK-NEXT:    ret i1 [[TMP12]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length5_lt(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i1 @length5_lt(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i40, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i40, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = zext i40 [[TMP1]] to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i40 [[TMP2]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp ult i64 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    ret i1 [[TMP7]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) nounwind
+  %c = icmp slt i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length6(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i32 @length6(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i48, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i48, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = zext i48 [[TMP1]] to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i48 [[TMP2]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp ugt i64 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ult i64 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP7]] to i32
+; CHECK-NEXT:    [[TMP10:%.*]] = zext i1 [[TMP8]] to i32
+; CHECK-NEXT:    [[TMP11:%.*]] = sub i32 [[TMP9]], [[TMP10]]
+; CHECK-NEXT:    ret i32 [[TMP11]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 6) nounwind
+  ret i32 %m
+}
+
+define i32 @length6_lt(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i32 @length6_lt(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i48, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i48, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = zext i48 [[TMP1]] to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i48 [[TMP2]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp ult i64 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; CHECK-NEXT:    ret i32 [[TMP8]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 6) nounwind
+  %r = lshr i32 %m, 31
+  ret i32 %r
+}
+
+define i32 @length7(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i32 @length7(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    br label [[LOADBB:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; CHECK-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb:
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; CHECK-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; CHECK-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; CHECK-NEXT:    ret i32 [[PHI_RES]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 7) nounwind
+  ret i32 %m
+}
+
+define i1 @length7_lt(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i1 @length7_lt(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    br label [[LOADBB:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; CHECK-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb:
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; CHECK-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; CHECK-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; CHECK-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 7) nounwind
+  %c = icmp slt i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length7_eq(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i1 @length7_eq(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; CHECK-NEXT:    ret i1 [[TMP10]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 7) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length8(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i32 @length8(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP1]])
+; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ugt i64 [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp ult i64 [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; CHECK-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; CHECK-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    ret i32 [[TMP9]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 8) nounwind
+  ret i32 %m
+}
+
+define i1 @length8_eq(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i1 @length8_eq(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 8) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length8_eq_const(ptr %X) nounwind {
+; CHECK-LABEL: define i1 @length8_eq_const(
+; CHECK-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ne i64 [[TMP1]], 3978425819141910832
+; CHECK-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; CHECK-NEXT:    ret i1 [[TMP2]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 8) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+; Three-way memcmp of 9 bytes. Expected shape: an i64 pair is loaded,
+; byte-swapped and compared in loadbb; on equality loadbb1 loads the single
+; byte at offset 8 from each side and returns the difference of the
+; zero-extended values. res_block yields -1 or 1 from an unsigned compare.
+define i32 @length9(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i32 @length9(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    br label [[LOADBB:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[TMP5:%.*]], [[TMP6:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb:
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; CHECK-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; CHECK-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; CHECK-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    br label [[ENDBLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; CHECK-NEXT:    ret i32 [[PHI_RES]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 9) nounwind
+  ret i32 %m
+}
+
+; Equality-only memcmp of 9 bytes (result compared `eq 0`). Expected shape is
+; branch-free: the xor of the i64 loads is OR'd with the xor of the
+; zero-extended bytes at offset 8, and the combined value is tested against 0.
+define i1 @length9_eq(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i1 @length9_eq(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i64
+; CHECK-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i64
+; CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; CHECK-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 9) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; Three-way memcmp of 10 bytes. Expected shape: an i64 compare at offset 0,
+; then an i16 pair at offset 8 that is byte-swapped and zero-extended to i64,
+; so that res_block can order whichever pair differed with a single unsigned
+; i64 compare fed by phis.
+define i32 @length10(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i32 @length10(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    br label [[LOADBB:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP14:%.*]], [[LOADBB1:%.*]] ]
+; CHECK-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP15:%.*]], [[LOADBB1]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb:
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; CHECK-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load i16, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = load i16, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP10]])
+; CHECK-NEXT:    [[TMP13:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP11]])
+; CHECK-NEXT:    [[TMP14]] = zext i16 [[TMP12]] to i64
+; CHECK-NEXT:    [[TMP15]] = zext i16 [[TMP13]] to i64
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[TMP14]], [[TMP15]]
+; CHECK-NEXT:    br i1 [[TMP16]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; CHECK-NEXT:    ret i32 [[PHI_RES]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 10) nounwind
+  ret i32 %m
+}
+
+; Equality-only memcmp of 10 bytes (result compared `eq 0`). Expected shape is
+; branch-free: xor of the i64 loads OR'd with xor of the zero-extended i16
+; loads at offset 8, then tested against zero. No bswap is expected here.
+define i1 @length10_eq(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i1 @length10_eq(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i16, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP6]] to i64
+; CHECK-NEXT:    [[TMP9:%.*]] = zext i16 [[TMP7]] to i64
+; CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; CHECK-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 10) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; Three-way memcmp of 11 bytes. Expected shape uses two i64 loads per side:
+; the second is taken at offset 3, so it overlaps the first (bytes 3..10)
+; instead of splitting the 3-byte tail into narrower loads.
+define i32 @length11(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i32 @length11(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    br label [[LOADBB:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; CHECK-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb:
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; CHECK-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; CHECK-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; CHECK-NEXT:    ret i32 [[PHI_RES]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 11) nounwind
+  ret i32 %m
+}
+
+; Equality-only memcmp of 11 bytes (result compared `eq 0`). Expected shape is
+; branch-free with two i64 loads per side, the second at offset 3 so that it
+; overlaps the first; the two xor results are OR'd and tested against zero.
+define i1 @length11_eq(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i1 @length11_eq(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 11) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; Note: despite the "_eq" in the name, this function tests
+; memcmp(X, Y, 12) != 0 (the compare in the body is `icmp ne`).
+; Expected shape is branch-free: xor of the i64 loads OR'd with xor of the
+; zero-extended i32 loads at offset 8; the i1 `ne 0` result is returned
+; directly, and the zext (TMP13) is matched but not used by the return.
+define i1 @length12_eq(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i1 @length12_eq(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
+; CHECK-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP7]] to i64
+; CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; CHECK-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; CHECK-NEXT:    ret i1 [[TMP12]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 12) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+; Three-way memcmp of 12 bytes. Expected shape: an i64 compare at offset 0,
+; then an i32 pair at offset 8 that is byte-swapped and zero-extended to i64
+; so both stages feed the same i64 phis in res_block.
+define i32 @length12(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i32 @length12(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    br label [[LOADBB:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP14:%.*]], [[LOADBB1:%.*]] ]
+; CHECK-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP15:%.*]], [[LOADBB1]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb:
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; CHECK-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; CHECK-NEXT:    [[TMP13:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; CHECK-NEXT:    [[TMP14]] = zext i32 [[TMP12]] to i64
+; CHECK-NEXT:    [[TMP15]] = zext i32 [[TMP13]] to i64
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[TMP14]], [[TMP15]]
+; CHECK-NEXT:    br i1 [[TMP16]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; CHECK-NEXT:    ret i32 [[PHI_RES]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 12) nounwind
+  ret i32 %m
+}
+
+; Equality-only memcmp of 13 bytes (result compared `eq 0`). Expected shape is
+; branch-free with two i64 loads per side, the second at offset 5 so that it
+; overlaps the first and ends exactly at byte 12.
+define i1 @length13_eq(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i1 @length13_eq(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 5
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 5
+; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 13) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; Equality-only memcmp of 14 bytes (result compared `eq 0`). Expected shape is
+; branch-free with two i64 loads per side, the second at offset 6 so that it
+; overlaps the first and ends exactly at byte 13.
+define i1 @length14_eq(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i1 @length14_eq(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 6
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 6
+; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 14) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; Three-way memcmp of 15 bytes. Expected shape uses two byte-swapped i64
+; compares per side; the second load is at offset 7, overlapping the first by
+; one byte. res_block orders the differing pair with an unsigned compare.
+define i32 @length15(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i32 @length15(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    br label [[LOADBB:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; CHECK-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb:
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; CHECK-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; CHECK-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; CHECK-NEXT:    ret i32 [[PHI_RES]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 15) nounwind
+  ret i32 %m
+}
+
+; memcmp of 15 bytes whose result is tested with `slt 0`. The expected
+; expansion is the full three-way form (two overlapping byte-swapped i64
+; compares, second load at offset 7) with the signed compare applied to the
+; final phi in endblock.
+define i1 @length15_lt(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i1 @length15_lt(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    br label [[LOADBB:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; CHECK-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb:
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; CHECK-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; CHECK-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; CHECK-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 15) nounwind
+  %c = icmp slt i32 %m, 0
+  ret i1 %c
+}
+
+; Three-way memcmp of 15 bytes of X against @.str starting at index 1.
+; Note: the %Y parameter is declared but never used by the body. The expected
+; output has the constant side folded to i64 immediates, so only X is loaded
+; and byte-swapped (second load at offset 7, overlapping the first).
+define i32 @length15_const(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i32 @length15_const(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    br label [[LOADBB:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4:%.*]], [[LOADBB]] ], [ [[TMP8:%.*]], [[LOADBB1:%.*]] ]
+; CHECK-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ 3544952156018063160, [[LOADBB]] ], [ 4051322327650219061, [[LOADBB1]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb:
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP4]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[TMP4]], 3544952156018063160
+; CHECK-NEXT:    br i1 [[TMP5]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP6]], align 1
+; CHECK-NEXT:    [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP7]])
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 4051322327650219061
+; CHECK-NEXT:    br i1 [[TMP9]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; CHECK-NEXT:    ret i32 [[PHI_RES]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i64 15) nounwind
+  ret i32 %m
+}
+
+; Equality-only memcmp of 15 bytes (result compared `eq 0`). Expected shape is
+; branch-free with two i64 loads per side, the second at offset 7 so that it
+; overlaps the first by one byte; the xor results are OR'd and tested for 0.
+define i1 @length15_eq(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i1 @length15_eq(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 15) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; memcmp of 15 bytes of X against @.str starting at index 1, with the result
+; tested `sgt 0`. Note: the %Y parameter is declared but never used by the
+; body. The expected output matches the three-way constant form (constant
+; side folded to i64 immediates) followed by the signed compare in endblock.
+define i1 @length15_gt_const(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i1 @length15_gt_const(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    br label [[LOADBB:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4:%.*]], [[LOADBB]] ], [ [[TMP8:%.*]], [[LOADBB1:%.*]] ]
+; CHECK-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ 3544952156018063160, [[LOADBB]] ], [ 4051322327650219061, [[LOADBB1]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb:
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP4]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[TMP4]], 3544952156018063160
+; CHECK-NEXT:    br i1 [[TMP5]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP6]], align 1
+; CHECK-NEXT:    [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP7]])
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 4051322327650219061
+; CHECK-NEXT:    br i1 [[TMP9]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; CHECK-NEXT:    [[C:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i64 15) nounwind
+  %c = icmp sgt i32 %m, 0
+  ret i1 %c
+}
+
+
+; Three-way memcmp of 16 bytes. Expected shape: two non-overlapping
+; byte-swapped i64 compares per side (offsets 0 and 8); res_block orders the
+; differing pair with an unsigned compare and yields -1 or 1.
+define i32 @length16(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i32 @length16(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    br label [[LOADBB:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; CHECK-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb:
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; CHECK-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; CHECK-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; CHECK-NEXT:    ret i32 [[PHI_RES]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 16) nounwind
+  ret i32 %m
+}
+
+; Note: despite the "_eq" in the name, this function tests
+; memcmp(x, y, 16) != 0 (the compare in the body is `icmp ne`).
+; Expected shape is branch-free: xor of the i64 loads at offsets 0 and 8,
+; OR'd together; the i1 `ne 0` result is returned directly, and the zext
+; (TMP11) is matched but not used by the return.
+define i1 @length16_eq(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length16_eq(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; CHECK-NEXT:    ret i1 [[TMP10]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 16) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+; memcmp of 16 bytes whose result is tested with `slt 0`. The expected
+; expansion is the full three-way form (byte-swapped i64 compares at offsets
+; 0 and 8) with the signed compare applied to the final phi in endblock.
+define i1 @length16_lt(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length16_lt(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    br label [[LOADBB:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; CHECK-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb:
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; CHECK-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; CHECK-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 16) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+; memcmp of 16 bytes whose result is tested with `sgt 0`. The expected
+; expansion is the full three-way form (byte-swapped i64 compares at offsets
+; 0 and 8) with the signed compare applied to the final phi in endblock.
+define i1 @length16_gt(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length16_gt(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    br label [[LOADBB:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; CHECK-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb:
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; CHECK-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; CHECK-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 16) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+; Equality-only memcmp of 16 bytes of X against @.str (result compared
+; `eq 0`). Expected shape is branch-free: each i64 load of X (offsets 0 and 8)
+; is xor'd with an i64 immediate, the results are OR'd and tested against 0.
+define i1 @length16_eq_const(ptr %X) nounwind {
+; CHECK-LABEL: define i1 @length16_eq_const(
+; CHECK-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 3978425819141910832
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 1
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 3833745473465760056
+; CHECK-NEXT:    [[TMP6:%.*]] = or i64 [[TMP2]], [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp ne i64 [[TMP6]], 0
+; CHECK-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP8]], 0
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 16) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+
+; Three-way memcmp of 24 bytes. Expected shape: three chained load blocks,
+; each comparing a byte-swapped i64 pair (offsets 0, 8 and 16); res_block
+; takes three-input phis and yields -1 or 1 from an unsigned compare.
+define i32 @length24(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i32 @length24(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    br label [[LOADBB:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; CHECK-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb:
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; CHECK-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; CHECK-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; CHECK:       loadbb2:
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; CHECK-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; CHECK-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; CHECK-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; CHECK-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; CHECK-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; CHECK-NEXT:    ret i32 [[PHI_RES]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 24) nounwind
+  ret i32 %m
+}
+
+; Equality-only memcmp of 24 bytes (result compared `eq 0`). Expected shape is
+; branch-free: three i64 xor results (offsets 0, 8 and 16) are combined with
+; two `or` instructions and the combined value is tested against zero.
+define i1 @length24_eq(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length24_eq(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; CHECK-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12:%.*]] = load i64, ptr [[TMP10]], align 1
+; CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP11]], [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or i64 [[TMP14]], [[TMP13]]
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp ne i64 [[TMP15]], 0
+; CHECK-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP17]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 24) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+; memcmp of 24 bytes whose result is tested with `slt 0`, so the ordering of
+; the buffers matters. The expected output is a chain of three load blocks
+; (byte offsets 0, 8 and 16); each block byte-swaps both i64 values and
+; continues to the next block while they are equal. On the first mismatch,
+; res_block selects -1 or 1 from an unsigned compare of the swapped values.
+define i1 @length24_lt(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length24_lt(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    br label [[LOADBB:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; CHECK-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb:
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; CHECK-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; CHECK-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; CHECK:       loadbb2:
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; CHECK-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; CHECK-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; CHECK-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; CHECK-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; CHECK-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 24) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+; memcmp of 24 bytes whose result is tested with `sgt 0`. The expected output
+; has the same shape as for the `slt 0` variant: three load blocks (byte
+; offsets 0, 8 and 16) comparing byte-swapped i64 values, with res_block
+; selecting -1 or 1 on the first mismatch. Only the final signed compare of
+; the merged result differs.
+define i1 @length24_gt(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length24_gt(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    br label [[LOADBB:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; CHECK-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb:
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; CHECK-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; CHECK-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; CHECK:       loadbb2:
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; CHECK-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; CHECK-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; CHECK-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; CHECK-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; CHECK-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 24) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+; memcmp of 24 bytes of %X against the global @.str, tested with `ne 0`.
+; The expected output loads only from %X (byte offsets 0, 8 and 16) and XORs
+; each i64 with an immediate; no load from @.str remains. The immediates are
+; presumably the corresponding bytes of @.str (its definition is not visible
+; here; confirm against the top of the test file).
+; The i1 from the compare against zero is returned directly, so the zext in
+; the expected output has no user.
+define i1 @length24_eq_const(ptr %X) nounwind {
+; CHECK-LABEL: define i1 @length24_eq_const(
+; CHECK-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 3978425819141910832
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 1
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 3833745473465760056
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP6]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 3689065127958034230
+; CHECK-NEXT:    [[TMP9:%.*]] = or i64 [[TMP2]], [[TMP5]]
+; CHECK-NEXT:    [[TMP10:%.*]] = or i64 [[TMP9]], [[TMP8]]
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp ne i64 [[TMP10]], 0
+; CHECK-NEXT:    [[TMP12:%.*]] = zext i1 [[TMP11]] to i32
+; CHECK-NEXT:    ret i1 [[TMP11]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 24) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+; memcmp of 31 bytes returning the full i32 result. 31 is not a multiple of
+; 8: the expected output uses four i64 load blocks at byte offsets 0, 8, 16
+; and 23, so the last load (bytes 23..30) overlaps the previous one (bytes
+; 16..23) by one byte and no narrower tail loads appear. Each block compares
+; byte-swapped values; res_block yields -1 or 1 on the first mismatch and
+; endblock yields 0 when all four blocks compare equal.
+define i32 @length31(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i32 @length31(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    br label [[LOADBB:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; CHECK-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb:
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; CHECK-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; CHECK-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; CHECK:       loadbb2:
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; CHECK-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; CHECK-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; CHECK-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; CHECK-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; CHECK-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; CHECK:       loadbb3:
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; CHECK-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; CHECK-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; CHECK-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; CHECK-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; CHECK-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; CHECK-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; CHECK-NEXT:    ret i32 [[PHI_RES]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 31) nounwind
+  ret i32 %m
+}
+
+; memcmp of 31 bytes whose result is only tested for equality with zero.
+; The expected output is straight-line code: four pairs of i64 loads at byte
+; offsets 0, 8, 16 and 23 (the last overlapping the third by one byte) are
+; XORed, the four XOR results are ORed pairwise and then together, and the
+; combined value is compared against zero.
+define i1 @length31_eq(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length31_eq(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; CHECK-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12:%.*]] = load i64, ptr [[TMP10]], align 1
+; CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP11]], [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; CHECK-NEXT:    [[TMP16:%.*]] = load i64, ptr [[TMP14]], align 1
+; CHECK-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; CHECK-NEXT:    [[TMP18:%.*]] = xor i64 [[TMP16]], [[TMP17]]
+; CHECK-NEXT:    [[TMP19:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; CHECK-NEXT:    [[TMP20:%.*]] = or i64 [[TMP13]], [[TMP18]]
+; CHECK-NEXT:    [[TMP21:%.*]] = or i64 [[TMP19]], [[TMP20]]
+; CHECK-NEXT:    [[TMP22:%.*]] = icmp ne i64 [[TMP21]], 0
+; CHECK-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP23]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 31) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+; memcmp of 31 bytes whose result is tested with `slt 0`. The expected output
+; is a chain of four load blocks at byte offsets 0, 8, 16 and 23 (the last
+; overlapping the third by one byte). Each block compares byte-swapped i64
+; values; res_block selects -1 or 1 on the first mismatch, and the merged
+; i32 result is then compared with `slt 0`.
+define i1 @length31_lt(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length31_lt(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    br label [[LOADBB:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; CHECK-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb:
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; CHECK-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; CHECK-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; CHECK:       loadbb2:
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; CHECK-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; CHECK-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; CHECK-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; CHECK-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; CHECK-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; CHECK:       loadbb3:
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; CHECK-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; CHECK-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; CHECK-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; CHECK-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; CHECK-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; CHECK-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 31) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+; memcmp of 31 bytes whose result is tested with `sgt 0`. The expected output
+; has the same four-block shape as for the `slt 0` variant (byte offsets 0,
+; 8, 16 and 23, byte-swapped i64 compares, -1 or 1 selected in res_block);
+; only the final signed compare of the merged result differs.
+define i1 @length31_gt(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length31_gt(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    br label [[LOADBB:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; CHECK-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb:
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; CHECK-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; CHECK-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; CHECK:       loadbb2:
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; CHECK-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; CHECK-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; CHECK-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; CHECK-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; CHECK-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; CHECK:       loadbb3:
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; CHECK-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; CHECK-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; CHECK-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; CHECK-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; CHECK-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; CHECK-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 31) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+; Equality-only memcmp of 31 bytes in a function carrying the
+; "prefer-vector-width"="128" attribute. The expected output still consists
+; of scalar i64 loads at byte offsets 0, 8, 16 and 23 combined with XOR/OR,
+; i.e. the same instruction sequence as without the attribute; only the
+; attribute group captured on the define line (ATTR1) differs.
+define i1 @length31_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"="128" {
+; CHECK-LABEL: define i1 @length31_eq_prefer128(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; CHECK-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12:%.*]] = load i64, ptr [[TMP10]], align 1
+; CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP11]], [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; CHECK-NEXT:    [[TMP16:%.*]] = load i64, ptr [[TMP14]], align 1
+; CHECK-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; CHECK-NEXT:    [[TMP18:%.*]] = xor i64 [[TMP16]], [[TMP17]]
+; CHECK-NEXT:    [[TMP19:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; CHECK-NEXT:    [[TMP20:%.*]] = or i64 [[TMP13]], [[TMP18]]
+; CHECK-NEXT:    [[TMP21:%.*]] = or i64 [[TMP19]], [[TMP20]]
+; CHECK-NEXT:    [[TMP22:%.*]] = icmp ne i64 [[TMP21]], 0
+; CHECK-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP23]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 31) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+; memcmp of 31 bytes of %X against the global @.str, tested with `ne 0`.
+; The expected output loads only from %X at byte offsets 0, 8, 16 and 23 (the
+; last load overlapping the third by one byte) and XORs each i64 with an
+; immediate; no load from @.str remains. The immediates are presumably the
+; corresponding bytes of @.str (its definition is not visible here; confirm
+; against the top of the test file).
+; The i1 from the compare against zero is returned directly, so the zext in
+; the expected output has no user.
+define i1 @length31_eq_const(ptr %X) nounwind {
+; CHECK-LABEL: define i1 @length31_eq_const(
+; CHECK-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 3978425819141910832
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 1
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 3833745473465760056
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP6]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 3689065127958034230
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 3474870397276861491
+; CHECK-NEXT:    [[TMP12:%.*]] = or i64 [[TMP2]], [[TMP5]]
+; CHECK-NEXT:    [[TMP13:%.*]] = or i64 [[TMP8]], [[TMP11]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or i64 [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp ne i64 [[TMP14]], 0
+; CHECK-NEXT:    [[TMP16:%.*]] = zext i1 [[TMP15]] to i32
+; CHECK-NEXT:    ret i1 [[TMP15]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 31) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+; memcmp of 32 bytes returning the full i32 result. The expected output is a
+; chain of four i64 load blocks at byte offsets 0, 8, 16 and 24, covering the
+; 32 bytes without overlap. Each block compares byte-swapped values;
+; res_block yields -1 or 1 on the first mismatch and endblock yields 0 when
+; all four blocks compare equal.
+define i32 @length32(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i32 @length32(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    br label [[LOADBB:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; CHECK-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb:
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; CHECK-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; CHECK-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; CHECK:       loadbb2:
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; CHECK-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; CHECK-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; CHECK-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; CHECK-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; CHECK-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; CHECK:       loadbb3:
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; CHECK-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; CHECK-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; CHECK-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; CHECK-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; CHECK-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; CHECK-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; CHECK-NEXT:    ret i32 [[PHI_RES]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 32) nounwind
+  ret i32 %m
+}
+
+
+; memcmp of 32 bytes whose result is only tested for equality with zero.
+; The expected output is straight-line code: four pairs of i64 loads at byte
+; offsets 0, 8, 16 and 24 are XORed, the four XOR results are ORed pairwise
+; and then together, and the combined value is compared against zero.
+define i1 @length32_eq(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length32_eq(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; CHECK-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12:%.*]] = load i64, ptr [[TMP10]], align 1
+; CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP11]], [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; CHECK-NEXT:    [[TMP16:%.*]] = load i64, ptr [[TMP14]], align 1
+; CHECK-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; CHECK-NEXT:    [[TMP18:%.*]] = xor i64 [[TMP16]], [[TMP17]]
+; CHECK-NEXT:    [[TMP19:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; CHECK-NEXT:    [[TMP20:%.*]] = or i64 [[TMP13]], [[TMP18]]
+; CHECK-NEXT:    [[TMP21:%.*]] = or i64 [[TMP19]], [[TMP20]]
+; CHECK-NEXT:    [[TMP22:%.*]] = icmp ne i64 [[TMP21]], 0
+; CHECK-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP23]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 32) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+; memcmp of 32 bytes whose result is tested with `slt 0`. The expected output
+; is a chain of four load blocks at byte offsets 0, 8, 16 and 24. Each block
+; compares byte-swapped i64 values; res_block selects -1 or 1 on the first
+; mismatch, and the merged i32 result is then compared with `slt 0`.
+define i1 @length32_lt(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length32_lt(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    br label [[LOADBB:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; CHECK-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb:
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; CHECK-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; CHECK-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; CHECK:       loadbb2:
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; CHECK-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; CHECK-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; CHECK-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; CHECK-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; CHECK-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; CHECK:       loadbb3:
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; CHECK-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; CHECK-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; CHECK-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; CHECK-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; CHECK-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; CHECK-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 32) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+; memcmp of 32 bytes whose result is tested with `sgt 0`. The expected output
+; has the same four-block shape as for the `slt 0` variant (byte offsets 0,
+; 8, 16 and 24, byte-swapped i64 compares, -1 or 1 selected in res_block);
+; only the final signed compare of the merged result differs.
+define i1 @length32_gt(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length32_gt(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    br label [[LOADBB:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; CHECK-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb:
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; CHECK-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; CHECK-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; CHECK:       loadbb2:
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; CHECK-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; CHECK-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; CHECK-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; CHECK-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; CHECK-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; CHECK:       loadbb3:
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; CHECK-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; CHECK-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; CHECK-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; CHECK-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; CHECK-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; CHECK-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 32) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length32_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"="128" {
+; CHECK-LABEL: define i1 @length32_eq_prefer128(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; CHECK-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12:%.*]] = load i64, ptr [[TMP10]], align 1
+; CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP11]], [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; CHECK-NEXT:    [[TMP16:%.*]] = load i64, ptr [[TMP14]], align 1
+; CHECK-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; CHECK-NEXT:    [[TMP18:%.*]] = xor i64 [[TMP16]], [[TMP17]]
+; CHECK-NEXT:    [[TMP19:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; CHECK-NEXT:    [[TMP20:%.*]] = or i64 [[TMP13]], [[TMP18]]
+; CHECK-NEXT:    [[TMP21:%.*]] = or i64 [[TMP19]], [[TMP20]]
+; CHECK-NEXT:    [[TMP22:%.*]] = icmp ne i64 [[TMP21]], 0
+; CHECK-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP23]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+; Input below: a 32-byte memcmp used only in an equality test, in a function carrying "prefer-vector-width"="128". The expected output above has no branches and uses scalar i64 loads only: four pairs (offsets 0, 8, 16, 24) are xor'd, or-reduced, and the result compared with zero.
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 32) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length32_eq_const(ptr %X) nounwind {
+; CHECK-LABEL: define i1 @length32_eq_const(
+; CHECK-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 3978425819141910832
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 1
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 3833745473465760056
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP6]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 3689065127958034230
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 3544395820347831604
+; CHECK-NEXT:    [[TMP12:%.*]] = or i64 [[TMP2]], [[TMP5]]
+; CHECK-NEXT:    [[TMP13:%.*]] = or i64 [[TMP8]], [[TMP11]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or i64 [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp ne i64 [[TMP14]], 0
+; CHECK-NEXT:    [[TMP16:%.*]] = zext i1 [[TMP15]] to i32
+; CHECK-NEXT:    ret i1 [[TMP15]]
+; Input below: a 32-byte memcmp against the global @.str, tested with icmp ne (despite the _eq in the name). In the expected output above only X is loaded; each of the four i64 loads is xor'd with an immediate, and the icmp ne of the or-reduction is returned directly, leaving the zext unused.
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 32) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length48(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i32 @length48(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    br label [[LOADBB:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ], [ [[TMP33:%.*]], [[LOADBB4:%.*]] ], [ [[TMP40:%.*]], [[LOADBB5:%.*]] ]
+; CHECK-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ], [ [[TMP34:%.*]], [[LOADBB4]] ], [ [[TMP41:%.*]], [[LOADBB5]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb:
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; CHECK-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; CHECK-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; CHECK:       loadbb2:
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; CHECK-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; CHECK-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; CHECK-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; CHECK-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; CHECK-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; CHECK:       loadbb3:
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; CHECK-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; CHECK-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; CHECK-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; CHECK-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; CHECK-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; CHECK-NEXT:    br i1 [[TMP28]], label [[LOADBB4]], label [[RES_BLOCK]]
+; CHECK:       loadbb4:
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; CHECK-NEXT:    [[TMP31:%.*]] = load i64, ptr [[TMP29]], align 1
+; CHECK-NEXT:    [[TMP32:%.*]] = load i64, ptr [[TMP30]], align 1
+; CHECK-NEXT:    [[TMP33]] = call i64 @llvm.bswap.i64(i64 [[TMP31]])
+; CHECK-NEXT:    [[TMP34]] = call i64 @llvm.bswap.i64(i64 [[TMP32]])
+; CHECK-NEXT:    [[TMP35:%.*]] = icmp eq i64 [[TMP33]], [[TMP34]]
+; CHECK-NEXT:    br i1 [[TMP35]], label [[LOADBB5]], label [[RES_BLOCK]]
+; CHECK:       loadbb5:
+; CHECK-NEXT:    [[TMP36:%.*]] = getelementptr i8, ptr [[X]], i64 40
+; CHECK-NEXT:    [[TMP37:%.*]] = getelementptr i8, ptr [[Y]], i64 40
+; CHECK-NEXT:    [[TMP38:%.*]] = load i64, ptr [[TMP36]], align 1
+; CHECK-NEXT:    [[TMP39:%.*]] = load i64, ptr [[TMP37]], align 1
+; CHECK-NEXT:    [[TMP40]] = call i64 @llvm.bswap.i64(i64 [[TMP38]])
+; CHECK-NEXT:    [[TMP41]] = call i64 @llvm.bswap.i64(i64 [[TMP39]])
+; CHECK-NEXT:    [[TMP42:%.*]] = icmp eq i64 [[TMP40]], [[TMP41]]
+; CHECK-NEXT:    br i1 [[TMP42]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB5]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; CHECK-NEXT:    ret i32 [[PHI_RES]]
+; Input below: a 48-byte memcmp whose i32 result is returned unchanged. The expected output above is a chain of six load blocks (offsets 0 through 40 in steps of 8): each byte-swaps an i64 pair and jumps to res_block on the first mismatch, where an unsigned compare selects -1 or 1; endblock yields 0 when all six pairs match.
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 48) nounwind
+  ret i32 %m
+}
+
+define i1 @length48_eq(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length48_eq(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; CHECK-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12:%.*]] = load i64, ptr [[TMP10]], align 1
+; CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP11]], [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; CHECK-NEXT:    [[TMP16:%.*]] = load i64, ptr [[TMP14]], align 1
+; CHECK-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; CHECK-NEXT:    [[TMP18:%.*]] = xor i64 [[TMP16]], [[TMP17]]
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; CHECK-NEXT:    [[TMP21:%.*]] = load i64, ptr [[TMP19]], align 1
+; CHECK-NEXT:    [[TMP22:%.*]] = load i64, ptr [[TMP20]], align 1
+; CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP21]], [[TMP22]]
+; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr i8, ptr [[X]], i64 40
+; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr i8, ptr [[Y]], i64 40
+; CHECK-NEXT:    [[TMP26:%.*]] = load i64, ptr [[TMP24]], align 1
+; CHECK-NEXT:    [[TMP27:%.*]] = load i64, ptr [[TMP25]], align 1
+; CHECK-NEXT:    [[TMP28:%.*]] = xor i64 [[TMP26]], [[TMP27]]
+; CHECK-NEXT:    [[TMP29:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; CHECK-NEXT:    [[TMP30:%.*]] = or i64 [[TMP13]], [[TMP18]]
+; CHECK-NEXT:    [[TMP31:%.*]] = or i64 [[TMP23]], [[TMP28]]
+; CHECK-NEXT:    [[TMP32:%.*]] = or i64 [[TMP29]], [[TMP30]]
+; CHECK-NEXT:    [[TMP33:%.*]] = or i64 [[TMP32]], [[TMP31]]
+; CHECK-NEXT:    [[TMP34:%.*]] = icmp ne i64 [[TMP33]], 0
+; CHECK-NEXT:    [[TMP35:%.*]] = zext i1 [[TMP34]] to i32
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP35]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+; Input below: a 48-byte memcmp used only in an equality test. The expected output above has no branches: six i64 load pairs (offsets 0 through 40 in steps of 8) are xor'd, or-reduced, and the result compared with zero.
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 48) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length48_lt(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length48_lt(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    br label [[LOADBB:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ], [ [[TMP33:%.*]], [[LOADBB4:%.*]] ], [ [[TMP40:%.*]], [[LOADBB5:%.*]] ]
+; CHECK-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ], [ [[TMP34:%.*]], [[LOADBB4]] ], [ [[TMP41:%.*]], [[LOADBB5]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb:
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; CHECK-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; CHECK-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; CHECK:       loadbb2:
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; CHECK-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; CHECK-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; CHECK-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; CHECK-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; CHECK-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; CHECK:       loadbb3:
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; CHECK-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; CHECK-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; CHECK-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; CHECK-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; CHECK-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; CHECK-NEXT:    br i1 [[TMP28]], label [[LOADBB4]], label [[RES_BLOCK]]
+; CHECK:       loadbb4:
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; CHECK-NEXT:    [[TMP31:%.*]] = load i64, ptr [[TMP29]], align 1
+; CHECK-NEXT:    [[TMP32:%.*]] = load i64, ptr [[TMP30]], align 1
+; CHECK-NEXT:    [[TMP33]] = call i64 @llvm.bswap.i64(i64 [[TMP31]])
+; CHECK-NEXT:    [[TMP34]] = call i64 @llvm.bswap.i64(i64 [[TMP32]])
+; CHECK-NEXT:    [[TMP35:%.*]] = icmp eq i64 [[TMP33]], [[TMP34]]
+; CHECK-NEXT:    br i1 [[TMP35]], label [[LOADBB5]], label [[RES_BLOCK]]
+; CHECK:       loadbb5:
+; CHECK-NEXT:    [[TMP36:%.*]] = getelementptr i8, ptr [[X]], i64 40
+; CHECK-NEXT:    [[TMP37:%.*]] = getelementptr i8, ptr [[Y]], i64 40
+; CHECK-NEXT:    [[TMP38:%.*]] = load i64, ptr [[TMP36]], align 1
+; CHECK-NEXT:    [[TMP39:%.*]] = load i64, ptr [[TMP37]], align 1
+; CHECK-NEXT:    [[TMP40]] = call i64 @llvm.bswap.i64(i64 [[TMP38]])
+; CHECK-NEXT:    [[TMP41]] = call i64 @llvm.bswap.i64(i64 [[TMP39]])
+; CHECK-NEXT:    [[TMP42:%.*]] = icmp eq i64 [[TMP40]], [[TMP41]]
+; CHECK-NEXT:    br i1 [[TMP42]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB5]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+; Input below: a 48-byte memcmp whose result is tested with icmp slt against 0. The expected output above is a six-block chain of byte-swapped i64 compares (offsets 0 through 40 in steps of 8) that jumps to res_block on the first mismatch (-1 or 1 via an unsigned compare); the slt test is applied to the endblock phi.
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 48) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length48_gt(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length48_gt(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    br label [[LOADBB:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ], [ [[TMP33:%.*]], [[LOADBB4:%.*]] ], [ [[TMP40:%.*]], [[LOADBB5:%.*]] ]
+; CHECK-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ], [ [[TMP34:%.*]], [[LOADBB4]] ], [ [[TMP41:%.*]], [[LOADBB5]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb:
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; CHECK-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; CHECK-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; CHECK:       loadbb2:
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; CHECK-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; CHECK-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; CHECK-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; CHECK-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; CHECK-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; CHECK:       loadbb3:
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; CHECK-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; CHECK-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; CHECK-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; CHECK-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; CHECK-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; CHECK-NEXT:    br i1 [[TMP28]], label [[LOADBB4]], label [[RES_BLOCK]]
+; CHECK:       loadbb4:
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; CHECK-NEXT:    [[TMP31:%.*]] = load i64, ptr [[TMP29]], align 1
+; CHECK-NEXT:    [[TMP32:%.*]] = load i64, ptr [[TMP30]], align 1
+; CHECK-NEXT:    [[TMP33]] = call i64 @llvm.bswap.i64(i64 [[TMP31]])
+; CHECK-NEXT:    [[TMP34]] = call i64 @llvm.bswap.i64(i64 [[TMP32]])
+; CHECK-NEXT:    [[TMP35:%.*]] = icmp eq i64 [[TMP33]], [[TMP34]]
+; CHECK-NEXT:    br i1 [[TMP35]], label [[LOADBB5]], label [[RES_BLOCK]]
+; CHECK:       loadbb5:
+; CHECK-NEXT:    [[TMP36:%.*]] = getelementptr i8, ptr [[X]], i64 40
+; CHECK-NEXT:    [[TMP37:%.*]] = getelementptr i8, ptr [[Y]], i64 40
+; CHECK-NEXT:    [[TMP38:%.*]] = load i64, ptr [[TMP36]], align 1
+; CHECK-NEXT:    [[TMP39:%.*]] = load i64, ptr [[TMP37]], align 1
+; CHECK-NEXT:    [[TMP40]] = call i64 @llvm.bswap.i64(i64 [[TMP38]])
+; CHECK-NEXT:    [[TMP41]] = call i64 @llvm.bswap.i64(i64 [[TMP39]])
+; CHECK-NEXT:    [[TMP42:%.*]] = icmp eq i64 [[TMP40]], [[TMP41]]
+; CHECK-NEXT:    br i1 [[TMP42]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB5]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+; Input below: a 48-byte memcmp whose result is tested with icmp sgt against 0. The expected output above is a six-block chain of byte-swapped i64 compares (offsets 0 through 40 in steps of 8) that jumps to res_block on the first mismatch (-1 or 1 via an unsigned compare); the sgt test is applied to the endblock phi.
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 48) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length48_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"="128" {
+; CHECK-LABEL: define i1 @length48_eq_prefer128(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; CHECK-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12:%.*]] = load i64, ptr [[TMP10]], align 1
+; CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP11]], [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; CHECK-NEXT:    [[TMP16:%.*]] = load i64, ptr [[TMP14]], align 1
+; CHECK-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; CHECK-NEXT:    [[TMP18:%.*]] = xor i64 [[TMP16]], [[TMP17]]
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; CHECK-NEXT:    [[TMP21:%.*]] = load i64, ptr [[TMP19]], align 1
+; CHECK-NEXT:    [[TMP22:%.*]] = load i64, ptr [[TMP20]], align 1
+; CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP21]], [[TMP22]]
+; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr i8, ptr [[X]], i64 40
+; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr i8, ptr [[Y]], i64 40
+; CHECK-NEXT:    [[TMP26:%.*]] = load i64, ptr [[TMP24]], align 1
+; CHECK-NEXT:    [[TMP27:%.*]] = load i64, ptr [[TMP25]], align 1
+; CHECK-NEXT:    [[TMP28:%.*]] = xor i64 [[TMP26]], [[TMP27]]
+; CHECK-NEXT:    [[TMP29:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; CHECK-NEXT:    [[TMP30:%.*]] = or i64 [[TMP13]], [[TMP18]]
+; CHECK-NEXT:    [[TMP31:%.*]] = or i64 [[TMP23]], [[TMP28]]
+; CHECK-NEXT:    [[TMP32:%.*]] = or i64 [[TMP29]], [[TMP30]]
+; CHECK-NEXT:    [[TMP33:%.*]] = or i64 [[TMP32]], [[TMP31]]
+; CHECK-NEXT:    [[TMP34:%.*]] = icmp ne i64 [[TMP33]], 0
+; CHECK-NEXT:    [[TMP35:%.*]] = zext i1 [[TMP34]] to i32
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP35]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+; Input below: a 48-byte memcmp used only in an equality test, in a function carrying "prefer-vector-width"="128". The expected output above has no branches and uses scalar i64 loads only: six pairs (offsets 0 through 40 in steps of 8) are xor'd, or-reduced, and the result compared with zero.
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 48) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length48_eq_const(ptr %X) nounwind {
+; CHECK-LABEL: define i1 @length48_eq_const(
+; CHECK-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 3978425819141910832
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 1
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 3833745473465760056
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP6]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 3689065127958034230
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 3544395820347831604
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; CHECK-NEXT:    [[TMP13:%.*]] = load i64, ptr [[TMP12]], align 1
+; CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 4123106164818064178
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 40
+; CHECK-NEXT:    [[TMP16:%.*]] = load i64, ptr [[TMP15]], align 1
+; CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 3978425819141910832
+; CHECK-NEXT:    [[TMP18:%.*]] = or i64 [[TMP2]], [[TMP5]]
+; CHECK-NEXT:    [[TMP19:%.*]] = or i64 [[TMP8]], [[TMP11]]
+; CHECK-NEXT:    [[TMP20:%.*]] = or i64 [[TMP14]], [[TMP17]]
+; CHECK-NEXT:    [[TMP21:%.*]] = or i64 [[TMP18]], [[TMP19]]
+; CHECK-NEXT:    [[TMP22:%.*]] = or i64 [[TMP21]], [[TMP20]]
+; CHECK-NEXT:    [[TMP23:%.*]] = icmp ne i64 [[TMP22]], 0
+; CHECK-NEXT:    [[TMP24:%.*]] = zext i1 [[TMP23]] to i32
+; CHECK-NEXT:    ret i1 [[TMP23]]
+; Input below: a 48-byte memcmp against the global @.str, tested with icmp ne (despite the _eq in the name). In the expected output above only X is loaded; each of the six i64 loads is xor'd with an immediate, and the icmp ne of the or-reduction is returned directly, leaving the zext unused.
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 48) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length63(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i32 @length63(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    br label [[LOADBB:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ], [ [[TMP33:%.*]], [[LOADBB4:%.*]] ], [ [[TMP40:%.*]], [[LOADBB5:%.*]] ], [ [[TMP47:%.*]], [[LOADBB6:%.*]] ], [ [[TMP54:%.*]], [[LOADBB7:%.*]] ]
+; CHECK-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ], [ [[TMP34:%.*]], [[LOADBB4]] ], [ [[TMP41:%.*]], [[LOADBB5]] ], [ [[TMP48:%.*]], [[LOADBB6]] ], [ [[TMP55:%.*]], [[LOADBB7]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb:
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; CHECK-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; CHECK-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; CHECK:       loadbb2:
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; CHECK-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; CHECK-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; CHECK-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; CHECK-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; CHECK-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; CHECK:       loadbb3:
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; CHECK-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; CHECK-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; CHECK-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; CHECK-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; CHECK-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; CHECK-NEXT:    br i1 [[TMP28]], label [[LOADBB4]], label [[RES_BLOCK]]
+; CHECK:       loadbb4:
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; CHECK-NEXT:    [[TMP31:%.*]] = load i64, ptr [[TMP29]], align 1
+; CHECK-NEXT:    [[TMP32:%.*]] = load i64, ptr [[TMP30]], align 1
+; CHECK-NEXT:    [[TMP33]] = call i64 @llvm.bswap.i64(i64 [[TMP31]])
+; CHECK-NEXT:    [[TMP34]] = call i64 @llvm.bswap.i64(i64 [[TMP32]])
+; CHECK-NEXT:    [[TMP35:%.*]] = icmp eq i64 [[TMP33]], [[TMP34]]
+; CHECK-NEXT:    br i1 [[TMP35]], label [[LOADBB5]], label [[RES_BLOCK]]
+; CHECK:       loadbb5:
+; CHECK-NEXT:    [[TMP36:%.*]] = getelementptr i8, ptr [[X]], i64 40
+; CHECK-NEXT:    [[TMP37:%.*]] = getelementptr i8, ptr [[Y]], i64 40
+; CHECK-NEXT:    [[TMP38:%.*]] = load i64, ptr [[TMP36]], align 1
+; CHECK-NEXT:    [[TMP39:%.*]] = load i64, ptr [[TMP37]], align 1
+; CHECK-NEXT:    [[TMP40]] = call i64 @llvm.bswap.i64(i64 [[TMP38]])
+; CHECK-NEXT:    [[TMP41]] = call i64 @llvm.bswap.i64(i64 [[TMP39]])
+; CHECK-NEXT:    [[TMP42:%.*]] = icmp eq i64 [[TMP40]], [[TMP41]]
+; CHECK-NEXT:    br i1 [[TMP42]], label [[LOADBB6]], label [[RES_BLOCK]]
+; CHECK:       loadbb6:
+; CHECK-NEXT:    [[TMP43:%.*]] = getelementptr i8, ptr [[X]], i64 48
+; CHECK-NEXT:    [[TMP44:%.*]] = getelementptr i8, ptr [[Y]], i64 48
+; CHECK-NEXT:    [[TMP45:%.*]] = load i64, ptr [[TMP43]], align 1
+; CHECK-NEXT:    [[TMP46:%.*]] = load i64, ptr [[TMP44]], align 1
+; CHECK-NEXT:    [[TMP47]] = call i64 @llvm.bswap.i64(i64 [[TMP45]])
+; CHECK-NEXT:    [[TMP48]] = call i64 @llvm.bswap.i64(i64 [[TMP46]])
+; CHECK-NEXT:    [[TMP49:%.*]] = icmp eq i64 [[TMP47]], [[TMP48]]
+; CHECK-NEXT:    br i1 [[TMP49]], label [[LOADBB7]], label [[RES_BLOCK]]
+; CHECK:       loadbb7:
+; CHECK-NEXT:    [[TMP50:%.*]] = getelementptr i8, ptr [[X]], i64 55
+; CHECK-NEXT:    [[TMP51:%.*]] = getelementptr i8, ptr [[Y]], i64 55
+; CHECK-NEXT:    [[TMP52:%.*]] = load i64, ptr [[TMP50]], align 1
+; CHECK-NEXT:    [[TMP53:%.*]] = load i64, ptr [[TMP51]], align 1
+; CHECK-NEXT:    [[TMP54]] = call i64 @llvm.bswap.i64(i64 [[TMP52]])
+; CHECK-NEXT:    [[TMP55]] = call i64 @llvm.bswap.i64(i64 [[TMP53]])
+; CHECK-NEXT:    [[TMP56:%.*]] = icmp eq i64 [[TMP54]], [[TMP55]]
+; CHECK-NEXT:    br i1 [[TMP56]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB7]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; CHECK-NEXT:    ret i32 [[PHI_RES]]
+; Input below: a 63-byte memcmp whose i32 result is returned unchanged. The expected output above is a chain of eight i64 load blocks: the first seven use offsets 0 through 48 in steps of 8 and the last uses offset 55, so it re-reads byte 55 already covered by the previous block. The first mismatch jumps to res_block (-1 or 1 via an unsigned compare); endblock yields 0 when all eight pairs match.
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 63) nounwind
+  ret i32 %m
+}
+
+define i1 @length63_eq(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length63_eq(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; CHECK-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12:%.*]] = load i64, ptr [[TMP10]], align 1
+; CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP11]], [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; CHECK-NEXT:    [[TMP16:%.*]] = load i64, ptr [[TMP14]], align 1
+; CHECK-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; CHECK-NEXT:    [[TMP18:%.*]] = xor i64 [[TMP16]], [[TMP17]]
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; CHECK-NEXT:    [[TMP21:%.*]] = load i64, ptr [[TMP19]], align 1
+; CHECK-NEXT:    [[TMP22:%.*]] = load i64, ptr [[TMP20]], align 1
+; CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP21]], [[TMP22]]
+; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr i8, ptr [[X]], i64 40
+; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr i8, ptr [[Y]], i64 40
+; CHECK-NEXT:    [[TMP26:%.*]] = load i64, ptr [[TMP24]], align 1
+; CHECK-NEXT:    [[TMP27:%.*]] = load i64, ptr [[TMP25]], align 1
+; CHECK-NEXT:    [[TMP28:%.*]] = xor i64 [[TMP26]], [[TMP27]]
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr i8, ptr [[X]], i64 48
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr i8, ptr [[Y]], i64 48
+; CHECK-NEXT:    [[TMP31:%.*]] = load i64, ptr [[TMP29]], align 1
+; CHECK-NEXT:    [[TMP32:%.*]] = load i64, ptr [[TMP30]], align 1
+; CHECK-NEXT:    [[TMP33:%.*]] = xor i64 [[TMP31]], [[TMP32]]
+; CHECK-NEXT:    [[TMP34:%.*]] = getelementptr i8, ptr [[X]], i64 55
+; CHECK-NEXT:    [[TMP35:%.*]] = getelementptr i8, ptr [[Y]], i64 55
+; CHECK-NEXT:    [[TMP36:%.*]] = load i64, ptr [[TMP34]], align 1
+; CHECK-NEXT:    [[TMP37:%.*]] = load i64, ptr [[TMP35]], align 1
+; CHECK-NEXT:    [[TMP38:%.*]] = xor i64 [[TMP36]], [[TMP37]]
+; CHECK-NEXT:    [[TMP39:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; CHECK-NEXT:    [[TMP40:%.*]] = or i64 [[TMP13]], [[TMP18]]
+; CHECK-NEXT:    [[TMP41:%.*]] = or i64 [[TMP23]], [[TMP28]]
+; CHECK-NEXT:    [[TMP42:%.*]] = or i64 [[TMP33]], [[TMP38]]
+; CHECK-NEXT:    [[TMP43:%.*]] = or i64 [[TMP39]], [[TMP40]]
+; CHECK-NEXT:    [[TMP44:%.*]] = or i64 [[TMP41]], [[TMP42]]
+; CHECK-NEXT:    [[TMP45:%.*]] = or i64 [[TMP43]], [[TMP44]]
+; CHECK-NEXT:    [[TMP46:%.*]] = icmp ne i64 [[TMP45]], 0
+; CHECK-NEXT:    [[TMP47:%.*]] = zext i1 [[TMP46]] to i32
+; CHECK-NEXT:    ret i1 [[TMP46]]
+; Input below: a 63-byte memcmp tested with icmp ne (despite the _eq in the name). The expected output above has no branches: eight i64 load pairs (offsets 0 through 48 in steps of 8, then 55) are xor'd and or-reduced, and the icmp ne against zero is returned directly, leaving the zext unused.
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 63) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length63_lt(ptr %x, ptr %y) nounwind { ; sign test (< 0) on the result of a 63-byte memcmp
+; CHECK-LABEL: define i1 @length63_lt(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    br label [[LOADBB:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ], [ [[TMP33:%.*]], [[LOADBB4:%.*]] ], [ [[TMP40:%.*]], [[LOADBB5:%.*]] ], [ [[TMP47:%.*]], [[LOADBB6:%.*]] ], [ [[TMP54:%.*]], [[LOADBB7:%.*]] ]
+; CHECK-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ], [ [[TMP34:%.*]], [[LOADBB4]] ], [ [[TMP41:%.*]], [[LOADBB5]] ], [ [[TMP48:%.*]], [[LOADBB6]] ], [ [[TMP55:%.*]], [[LOADBB7]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb:
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; CHECK-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; CHECK-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; CHECK:       loadbb2:
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; CHECK-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; CHECK-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; CHECK-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; CHECK-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; CHECK-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; CHECK:       loadbb3:
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; CHECK-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; CHECK-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; CHECK-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; CHECK-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; CHECK-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; CHECK-NEXT:    br i1 [[TMP28]], label [[LOADBB4]], label [[RES_BLOCK]]
+; CHECK:       loadbb4:
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; CHECK-NEXT:    [[TMP31:%.*]] = load i64, ptr [[TMP29]], align 1
+; CHECK-NEXT:    [[TMP32:%.*]] = load i64, ptr [[TMP30]], align 1
+; CHECK-NEXT:    [[TMP33]] = call i64 @llvm.bswap.i64(i64 [[TMP31]])
+; CHECK-NEXT:    [[TMP34]] = call i64 @llvm.bswap.i64(i64 [[TMP32]])
+; CHECK-NEXT:    [[TMP35:%.*]] = icmp eq i64 [[TMP33]], [[TMP34]]
+; CHECK-NEXT:    br i1 [[TMP35]], label [[LOADBB5]], label [[RES_BLOCK]]
+; CHECK:       loadbb5:
+; CHECK-NEXT:    [[TMP36:%.*]] = getelementptr i8, ptr [[X]], i64 40
+; CHECK-NEXT:    [[TMP37:%.*]] = getelementptr i8, ptr [[Y]], i64 40
+; CHECK-NEXT:    [[TMP38:%.*]] = load i64, ptr [[TMP36]], align 1
+; CHECK-NEXT:    [[TMP39:%.*]] = load i64, ptr [[TMP37]], align 1
+; CHECK-NEXT:    [[TMP40]] = call i64 @llvm.bswap.i64(i64 [[TMP38]])
+; CHECK-NEXT:    [[TMP41]] = call i64 @llvm.bswap.i64(i64 [[TMP39]])
+; CHECK-NEXT:    [[TMP42:%.*]] = icmp eq i64 [[TMP40]], [[TMP41]]
+; CHECK-NEXT:    br i1 [[TMP42]], label [[LOADBB6]], label [[RES_BLOCK]]
+; CHECK:       loadbb6:
+; CHECK-NEXT:    [[TMP43:%.*]] = getelementptr i8, ptr [[X]], i64 48
+; CHECK-NEXT:    [[TMP44:%.*]] = getelementptr i8, ptr [[Y]], i64 48
+; CHECK-NEXT:    [[TMP45:%.*]] = load i64, ptr [[TMP43]], align 1
+; CHECK-NEXT:    [[TMP46:%.*]] = load i64, ptr [[TMP44]], align 1
+; CHECK-NEXT:    [[TMP47]] = call i64 @llvm.bswap.i64(i64 [[TMP45]])
+; CHECK-NEXT:    [[TMP48]] = call i64 @llvm.bswap.i64(i64 [[TMP46]])
+; CHECK-NEXT:    [[TMP49:%.*]] = icmp eq i64 [[TMP47]], [[TMP48]]
+; CHECK-NEXT:    br i1 [[TMP49]], label [[LOADBB7]], label [[RES_BLOCK]]
+; CHECK:       loadbb7:
+; CHECK-NEXT:    [[TMP50:%.*]] = getelementptr i8, ptr [[X]], i64 55
+; CHECK-NEXT:    [[TMP51:%.*]] = getelementptr i8, ptr [[Y]], i64 55
+; CHECK-NEXT:    [[TMP52:%.*]] = load i64, ptr [[TMP50]], align 1
+; CHECK-NEXT:    [[TMP53:%.*]] = load i64, ptr [[TMP51]], align 1
+; CHECK-NEXT:    [[TMP54]] = call i64 @llvm.bswap.i64(i64 [[TMP52]])
+; CHECK-NEXT:    [[TMP55]] = call i64 @llvm.bswap.i64(i64 [[TMP53]])
+; CHECK-NEXT:    [[TMP56:%.*]] = icmp eq i64 [[TMP54]], [[TMP55]]
+; CHECK-NEXT:    br i1 [[TMP56]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB7]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 63) nounwind ; expected as a chain of eight load blocks that bswap and compare i64 words; the last reads at offset 55, overlapping the previous word
+  %cmp = icmp slt i32 %call, 0 ; expected to remain, applied to the endblock phi (0 when all words match, else -1 or 1 from res_block)
+  ret i1 %cmp
+}
+
+define i1 @length63_gt(ptr %x, ptr %y) nounwind { ; sign test (> 0) on the result of a 63-byte memcmp
+; CHECK-LABEL: define i1 @length63_gt(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    br label [[LOADBB:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ], [ [[TMP33:%.*]], [[LOADBB4:%.*]] ], [ [[TMP40:%.*]], [[LOADBB5:%.*]] ], [ [[TMP47:%.*]], [[LOADBB6:%.*]] ], [ [[TMP54:%.*]], [[LOADBB7:%.*]] ]
+; CHECK-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ], [ [[TMP34:%.*]], [[LOADBB4]] ], [ [[TMP41:%.*]], [[LOADBB5]] ], [ [[TMP48:%.*]], [[LOADBB6]] ], [ [[TMP55:%.*]], [[LOADBB7]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb:
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; CHECK-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; CHECK-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; CHECK:       loadbb2:
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; CHECK-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; CHECK-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; CHECK-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; CHECK-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; CHECK-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; CHECK:       loadbb3:
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; CHECK-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; CHECK-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; CHECK-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; CHECK-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; CHECK-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; CHECK-NEXT:    br i1 [[TMP28]], label [[LOADBB4]], label [[RES_BLOCK]]
+; CHECK:       loadbb4:
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; CHECK-NEXT:    [[TMP31:%.*]] = load i64, ptr [[TMP29]], align 1
+; CHECK-NEXT:    [[TMP32:%.*]] = load i64, ptr [[TMP30]], align 1
+; CHECK-NEXT:    [[TMP33]] = call i64 @llvm.bswap.i64(i64 [[TMP31]])
+; CHECK-NEXT:    [[TMP34]] = call i64 @llvm.bswap.i64(i64 [[TMP32]])
+; CHECK-NEXT:    [[TMP35:%.*]] = icmp eq i64 [[TMP33]], [[TMP34]]
+; CHECK-NEXT:    br i1 [[TMP35]], label [[LOADBB5]], label [[RES_BLOCK]]
+; CHECK:       loadbb5:
+; CHECK-NEXT:    [[TMP36:%.*]] = getelementptr i8, ptr [[X]], i64 40
+; CHECK-NEXT:    [[TMP37:%.*]] = getelementptr i8, ptr [[Y]], i64 40
+; CHECK-NEXT:    [[TMP38:%.*]] = load i64, ptr [[TMP36]], align 1
+; CHECK-NEXT:    [[TMP39:%.*]] = load i64, ptr [[TMP37]], align 1
+; CHECK-NEXT:    [[TMP40]] = call i64 @llvm.bswap.i64(i64 [[TMP38]])
+; CHECK-NEXT:    [[TMP41]] = call i64 @llvm.bswap.i64(i64 [[TMP39]])
+; CHECK-NEXT:    [[TMP42:%.*]] = icmp eq i64 [[TMP40]], [[TMP41]]
+; CHECK-NEXT:    br i1 [[TMP42]], label [[LOADBB6]], label [[RES_BLOCK]]
+; CHECK:       loadbb6:
+; CHECK-NEXT:    [[TMP43:%.*]] = getelementptr i8, ptr [[X]], i64 48
+; CHECK-NEXT:    [[TMP44:%.*]] = getelementptr i8, ptr [[Y]], i64 48
+; CHECK-NEXT:    [[TMP45:%.*]] = load i64, ptr [[TMP43]], align 1
+; CHECK-NEXT:    [[TMP46:%.*]] = load i64, ptr [[TMP44]], align 1
+; CHECK-NEXT:    [[TMP47]] = call i64 @llvm.bswap.i64(i64 [[TMP45]])
+; CHECK-NEXT:    [[TMP48]] = call i64 @llvm.bswap.i64(i64 [[TMP46]])
+; CHECK-NEXT:    [[TMP49:%.*]] = icmp eq i64 [[TMP47]], [[TMP48]]
+; CHECK-NEXT:    br i1 [[TMP49]], label [[LOADBB7]], label [[RES_BLOCK]]
+; CHECK:       loadbb7:
+; CHECK-NEXT:    [[TMP50:%.*]] = getelementptr i8, ptr [[X]], i64 55
+; CHECK-NEXT:    [[TMP51:%.*]] = getelementptr i8, ptr [[Y]], i64 55
+; CHECK-NEXT:    [[TMP52:%.*]] = load i64, ptr [[TMP50]], align 1
+; CHECK-NEXT:    [[TMP53:%.*]] = load i64, ptr [[TMP51]], align 1
+; CHECK-NEXT:    [[TMP54]] = call i64 @llvm.bswap.i64(i64 [[TMP52]])
+; CHECK-NEXT:    [[TMP55]] = call i64 @llvm.bswap.i64(i64 [[TMP53]])
+; CHECK-NEXT:    [[TMP56:%.*]] = icmp eq i64 [[TMP54]], [[TMP55]]
+; CHECK-NEXT:    br i1 [[TMP56]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB7]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 63) nounwind ; expected as a chain of eight load blocks that bswap and compare i64 words; the last reads at offset 55, overlapping the previous word
+  %cmp = icmp sgt i32 %call, 0 ; expected to remain, applied to the endblock phi (0 when all words match, else -1 or 1 from res_block)
+  ret i1 %cmp
+}
+
+define i1 @length63_eq_const(ptr %X) nounwind { ; equality (== 0) of a 63-byte memcmp against the global @.str, which is defined outside this excerpt
+; CHECK-LABEL: define i1 @length63_eq_const(
+; CHECK-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 3978425819141910832
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 1
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 3833745473465760056
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP6]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 3689065127958034230
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 3544395820347831604
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; CHECK-NEXT:    [[TMP13:%.*]] = load i64, ptr [[TMP12]], align 1
+; CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 4123106164818064178
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 40
+; CHECK-NEXT:    [[TMP16:%.*]] = load i64, ptr [[TMP15]], align 1
+; CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 3978425819141910832
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr i8, ptr [[X]], i64 48
+; CHECK-NEXT:    [[TMP19:%.*]] = load i64, ptr [[TMP18]], align 1
+; CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 3833745473465760056
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr i8, ptr [[X]], i64 55
+; CHECK-NEXT:    [[TMP22:%.*]] = load i64, ptr [[TMP21]], align 1
+; CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 3616724998069630517
+; CHECK-NEXT:    [[TMP24:%.*]] = or i64 [[TMP2]], [[TMP5]]
+; CHECK-NEXT:    [[TMP25:%.*]] = or i64 [[TMP8]], [[TMP11]]
+; CHECK-NEXT:    [[TMP26:%.*]] = or i64 [[TMP14]], [[TMP17]]
+; CHECK-NEXT:    [[TMP27:%.*]] = or i64 [[TMP20]], [[TMP23]]
+; CHECK-NEXT:    [[TMP28:%.*]] = or i64 [[TMP24]], [[TMP25]]
+; CHECK-NEXT:    [[TMP29:%.*]] = or i64 [[TMP26]], [[TMP27]]
+; CHECK-NEXT:    [[TMP30:%.*]] = or i64 [[TMP28]], [[TMP29]]
+; CHECK-NEXT:    [[TMP31:%.*]] = icmp ne i64 [[TMP30]], 0
+; CHECK-NEXT:    [[TMP32:%.*]] = zext i1 [[TMP31]] to i32
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP32]], 0
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 63) nounwind ; expectations load only from %X and xor with i64 immediates - presumably the folded bytes of @.str, TODO confirm
+  %c = icmp eq i32 %m, 0 ; expected to remain as an icmp eq on the zext of the or-reduction's ne-zero test
+  ret i1 %c
+}
+
+define i32 @length64(ptr %X, ptr %Y) nounwind { ; 64-byte memcmp whose raw i32 result is returned
+; CHECK-LABEL: define i32 @length64(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    br label [[LOADBB:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ], [ [[TMP33:%.*]], [[LOADBB4:%.*]] ], [ [[TMP40:%.*]], [[LOADBB5:%.*]] ], [ [[TMP47:%.*]], [[LOADBB6:%.*]] ], [ [[TMP54:%.*]], [[LOADBB7:%.*]] ]
+; CHECK-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ], [ [[TMP34:%.*]], [[LOADBB4]] ], [ [[TMP41:%.*]], [[LOADBB5]] ], [ [[TMP48:%.*]], [[LOADBB6]] ], [ [[TMP55:%.*]], [[LOADBB7]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb:
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; CHECK-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; CHECK-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; CHECK:       loadbb2:
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; CHECK-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; CHECK-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; CHECK-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; CHECK-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; CHECK-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; CHECK:       loadbb3:
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; CHECK-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; CHECK-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; CHECK-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; CHECK-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; CHECK-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; CHECK-NEXT:    br i1 [[TMP28]], label [[LOADBB4]], label [[RES_BLOCK]]
+; CHECK:       loadbb4:
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; CHECK-NEXT:    [[TMP31:%.*]] = load i64, ptr [[TMP29]], align 1
+; CHECK-NEXT:    [[TMP32:%.*]] = load i64, ptr [[TMP30]], align 1
+; CHECK-NEXT:    [[TMP33]] = call i64 @llvm.bswap.i64(i64 [[TMP31]])
+; CHECK-NEXT:    [[TMP34]] = call i64 @llvm.bswap.i64(i64 [[TMP32]])
+; CHECK-NEXT:    [[TMP35:%.*]] = icmp eq i64 [[TMP33]], [[TMP34]]
+; CHECK-NEXT:    br i1 [[TMP35]], label [[LOADBB5]], label [[RES_BLOCK]]
+; CHECK:       loadbb5:
+; CHECK-NEXT:    [[TMP36:%.*]] = getelementptr i8, ptr [[X]], i64 40
+; CHECK-NEXT:    [[TMP37:%.*]] = getelementptr i8, ptr [[Y]], i64 40
+; CHECK-NEXT:    [[TMP38:%.*]] = load i64, ptr [[TMP36]], align 1
+; CHECK-NEXT:    [[TMP39:%.*]] = load i64, ptr [[TMP37]], align 1
+; CHECK-NEXT:    [[TMP40]] = call i64 @llvm.bswap.i64(i64 [[TMP38]])
+; CHECK-NEXT:    [[TMP41]] = call i64 @llvm.bswap.i64(i64 [[TMP39]])
+; CHECK-NEXT:    [[TMP42:%.*]] = icmp eq i64 [[TMP40]], [[TMP41]]
+; CHECK-NEXT:    br i1 [[TMP42]], label [[LOADBB6]], label [[RES_BLOCK]]
+; CHECK:       loadbb6:
+; CHECK-NEXT:    [[TMP43:%.*]] = getelementptr i8, ptr [[X]], i64 48
+; CHECK-NEXT:    [[TMP44:%.*]] = getelementptr i8, ptr [[Y]], i64 48
+; CHECK-NEXT:    [[TMP45:%.*]] = load i64, ptr [[TMP43]], align 1
+; CHECK-NEXT:    [[TMP46:%.*]] = load i64, ptr [[TMP44]], align 1
+; CHECK-NEXT:    [[TMP47]] = call i64 @llvm.bswap.i64(i64 [[TMP45]])
+; CHECK-NEXT:    [[TMP48]] = call i64 @llvm.bswap.i64(i64 [[TMP46]])
+; CHECK-NEXT:    [[TMP49:%.*]] = icmp eq i64 [[TMP47]], [[TMP48]]
+; CHECK-NEXT:    br i1 [[TMP49]], label [[LOADBB7]], label [[RES_BLOCK]]
+; CHECK:       loadbb7:
+; CHECK-NEXT:    [[TMP50:%.*]] = getelementptr i8, ptr [[X]], i64 56
+; CHECK-NEXT:    [[TMP51:%.*]] = getelementptr i8, ptr [[Y]], i64 56
+; CHECK-NEXT:    [[TMP52:%.*]] = load i64, ptr [[TMP50]], align 1
+; CHECK-NEXT:    [[TMP53:%.*]] = load i64, ptr [[TMP51]], align 1
+; CHECK-NEXT:    [[TMP54]] = call i64 @llvm.bswap.i64(i64 [[TMP52]])
+; CHECK-NEXT:    [[TMP55]] = call i64 @llvm.bswap.i64(i64 [[TMP53]])
+; CHECK-NEXT:    [[TMP56:%.*]] = icmp eq i64 [[TMP54]], [[TMP55]]
+; CHECK-NEXT:    br i1 [[TMP56]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB7]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; CHECK-NEXT:    ret i32 [[PHI_RES]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 64) nounwind ; expected as eight non-overlapping i64 blocks (offsets 0..56); a mismatch branches to res_block, which yields -1 or 1
+  ret i32 %m
+}
+
+define i1 @length64_eq(ptr %x, ptr %y) nounwind { ; zero/non-zero use of a 64-byte memcmp; note the predicate below is ne despite the _eq name
+; CHECK-LABEL: define i1 @length64_eq(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; CHECK-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12:%.*]] = load i64, ptr [[TMP10]], align 1
+; CHECK-NEXT:    [[TMP13:%.*]] = xor i64 [[TMP11]], [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; CHECK-NEXT:    [[TMP16:%.*]] = load i64, ptr [[TMP14]], align 1
+; CHECK-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; CHECK-NEXT:    [[TMP18:%.*]] = xor i64 [[TMP16]], [[TMP17]]
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; CHECK-NEXT:    [[TMP21:%.*]] = load i64, ptr [[TMP19]], align 1
+; CHECK-NEXT:    [[TMP22:%.*]] = load i64, ptr [[TMP20]], align 1
+; CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP21]], [[TMP22]]
+; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr i8, ptr [[X]], i64 40
+; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr i8, ptr [[Y]], i64 40
+; CHECK-NEXT:    [[TMP26:%.*]] = load i64, ptr [[TMP24]], align 1
+; CHECK-NEXT:    [[TMP27:%.*]] = load i64, ptr [[TMP25]], align 1
+; CHECK-NEXT:    [[TMP28:%.*]] = xor i64 [[TMP26]], [[TMP27]]
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr i8, ptr [[X]], i64 48
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr i8, ptr [[Y]], i64 48
+; CHECK-NEXT:    [[TMP31:%.*]] = load i64, ptr [[TMP29]], align 1
+; CHECK-NEXT:    [[TMP32:%.*]] = load i64, ptr [[TMP30]], align 1
+; CHECK-NEXT:    [[TMP33:%.*]] = xor i64 [[TMP31]], [[TMP32]]
+; CHECK-NEXT:    [[TMP34:%.*]] = getelementptr i8, ptr [[X]], i64 56
+; CHECK-NEXT:    [[TMP35:%.*]] = getelementptr i8, ptr [[Y]], i64 56
+; CHECK-NEXT:    [[TMP36:%.*]] = load i64, ptr [[TMP34]], align 1
+; CHECK-NEXT:    [[TMP37:%.*]] = load i64, ptr [[TMP35]], align 1
+; CHECK-NEXT:    [[TMP38:%.*]] = xor i64 [[TMP36]], [[TMP37]]
+; CHECK-NEXT:    [[TMP39:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; CHECK-NEXT:    [[TMP40:%.*]] = or i64 [[TMP13]], [[TMP18]]
+; CHECK-NEXT:    [[TMP41:%.*]] = or i64 [[TMP23]], [[TMP28]]
+; CHECK-NEXT:    [[TMP42:%.*]] = or i64 [[TMP33]], [[TMP38]]
+; CHECK-NEXT:    [[TMP43:%.*]] = or i64 [[TMP39]], [[TMP40]]
+; CHECK-NEXT:    [[TMP44:%.*]] = or i64 [[TMP41]], [[TMP42]]
+; CHECK-NEXT:    [[TMP45:%.*]] = or i64 [[TMP43]], [[TMP44]]
+; CHECK-NEXT:    [[TMP46:%.*]] = icmp ne i64 [[TMP45]], 0
+; CHECK-NEXT:    [[TMP47:%.*]] = zext i1 [[TMP46]] to i32
+; CHECK-NEXT:    ret i1 [[TMP46]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 64) nounwind ; expected as eight non-overlapping i64 load pairs at offsets 0..56
+  %cmp = icmp ne i32 %call, 0 ; expectations are a branch-free xor/or reduction; the i1 from the final ne-zero test is returned directly
+  ret i1 %cmp
+}
+
+define i1 @length64_lt(ptr %x, ptr %y) nounwind { ; sign test (< 0) on the result of a 64-byte memcmp
+; CHECK-LABEL: define i1 @length64_lt(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    br label [[LOADBB:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ], [ [[TMP33:%.*]], [[LOADBB4:%.*]] ], [ [[TMP40:%.*]], [[LOADBB5:%.*]] ], [ [[TMP47:%.*]], [[LOADBB6:%.*]] ], [ [[TMP54:%.*]], [[LOADBB7:%.*]] ]
+; CHECK-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ], [ [[TMP34:%.*]], [[LOADBB4]] ], [ [[TMP41:%.*]], [[LOADBB5]] ], [ [[TMP48:%.*]], [[LOADBB6]] ], [ [[TMP55:%.*]], [[LOADBB7]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb:
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; CHECK-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; CHECK-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; CHECK:       loadbb2:
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; CHECK-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; CHECK-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; CHECK-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; CHECK-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; CHECK-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; CHECK:       loadbb3:
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; CHECK-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; CHECK-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; CHECK-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; CHECK-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; CHECK-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; CHECK-NEXT:    br i1 [[TMP28]], label [[LOADBB4]], label [[RES_BLOCK]]
+; CHECK:       loadbb4:
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; CHECK-NEXT:    [[TMP31:%.*]] = load i64, ptr [[TMP29]], align 1
+; CHECK-NEXT:    [[TMP32:%.*]] = load i64, ptr [[TMP30]], align 1
+; CHECK-NEXT:    [[TMP33]] = call i64 @llvm.bswap.i64(i64 [[TMP31]])
+; CHECK-NEXT:    [[TMP34]] = call i64 @llvm.bswap.i64(i64 [[TMP32]])
+; CHECK-NEXT:    [[TMP35:%.*]] = icmp eq i64 [[TMP33]], [[TMP34]]
+; CHECK-NEXT:    br i1 [[TMP35]], label [[LOADBB5]], label [[RES_BLOCK]]
+; CHECK:       loadbb5:
+; CHECK-NEXT:    [[TMP36:%.*]] = getelementptr i8, ptr [[X]], i64 40
+; CHECK-NEXT:    [[TMP37:%.*]] = getelementptr i8, ptr [[Y]], i64 40
+; CHECK-NEXT:    [[TMP38:%.*]] = load i64, ptr [[TMP36]], align 1
+; CHECK-NEXT:    [[TMP39:%.*]] = load i64, ptr [[TMP37]], align 1
+; CHECK-NEXT:    [[TMP40]] = call i64 @llvm.bswap.i64(i64 [[TMP38]])
+; CHECK-NEXT:    [[TMP41]] = call i64 @llvm.bswap.i64(i64 [[TMP39]])
+; CHECK-NEXT:    [[TMP42:%.*]] = icmp eq i64 [[TMP40]], [[TMP41]]
+; CHECK-NEXT:    br i1 [[TMP42]], label [[LOADBB6]], label [[RES_BLOCK]]
+; CHECK:       loadbb6:
+; CHECK-NEXT:    [[TMP43:%.*]] = getelementptr i8, ptr [[X]], i64 48
+; CHECK-NEXT:    [[TMP44:%.*]] = getelementptr i8, ptr [[Y]], i64 48
+; CHECK-NEXT:    [[TMP45:%.*]] = load i64, ptr [[TMP43]], align 1
+; CHECK-NEXT:    [[TMP46:%.*]] = load i64, ptr [[TMP44]], align 1
+; CHECK-NEXT:    [[TMP47]] = call i64 @llvm.bswap.i64(i64 [[TMP45]])
+; CHECK-NEXT:    [[TMP48]] = call i64 @llvm.bswap.i64(i64 [[TMP46]])
+; CHECK-NEXT:    [[TMP49:%.*]] = icmp eq i64 [[TMP47]], [[TMP48]]
+; CHECK-NEXT:    br i1 [[TMP49]], label [[LOADBB7]], label [[RES_BLOCK]]
+; CHECK:       loadbb7:
+; CHECK-NEXT:    [[TMP50:%.*]] = getelementptr i8, ptr [[X]], i64 56
+; CHECK-NEXT:    [[TMP51:%.*]] = getelementptr i8, ptr [[Y]], i64 56
+; CHECK-NEXT:    [[TMP52:%.*]] = load i64, ptr [[TMP50]], align 1
+; CHECK-NEXT:    [[TMP53:%.*]] = load i64, ptr [[TMP51]], align 1
+; CHECK-NEXT:    [[TMP54]] = call i64 @llvm.bswap.i64(i64 [[TMP52]])
+; CHECK-NEXT:    [[TMP55]] = call i64 @llvm.bswap.i64(i64 [[TMP53]])
+; CHECK-NEXT:    [[TMP56:%.*]] = icmp eq i64 [[TMP54]], [[TMP55]]
+; CHECK-NEXT:    br i1 [[TMP56]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB7]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 64) nounwind ; expected as a chain of eight non-overlapping load blocks (offsets 0..56) that bswap and compare i64 words
+  %cmp = icmp slt i32 %call, 0 ; expected to remain, applied to the endblock phi (0 when all words match, else -1 or 1 from res_block)
+  ret i1 %cmp
+}
+
+define i1 @length64_gt(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length64_gt(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    br label [[LOADBB:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ], [ [[TMP33:%.*]], [[LOADBB4:%.*]] ], [ [[TMP40:%.*]], [[LOADBB5:%.*]] ], [ [[TMP47:%.*]], [[LOADBB6:%.*]] ], [ [[TMP54:%.*]], [[LOADBB7:%.*]] ]
+; CHECK-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ], [ [[TMP34:%.*]], [[LOADBB4]] ], [ [[TMP41:%.*]], [[LOADBB5]] ], [ [[TMP48:%.*]], [[LOADBB6]] ], [ [[TMP55:%.*]], [[LOADBB7]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb:
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; CHECK-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; CHECK-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; CHECK:       loadbb2:
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; CHECK-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; CHECK-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; CHECK-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; CHECK-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; CHECK-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; CHECK:       loadbb3:
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; CHECK-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; CHECK-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; CHECK-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; CHECK-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; CHECK-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; CHECK-NEXT:    br i1 [[TMP28]], label [[LOADBB4]], label [[RES_BLOCK]]
+; CHECK:       loadbb4:
+; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; CHECK-NEXT:    [[TMP31:%.*]] = load i64, ptr [[TMP29]], align 1
+; CHECK-NEXT:    [[TMP32:%.*]] = load i64, ptr [[TMP30]], align 1
+; CHECK-NEXT:    [[TMP33]] = call i64 @llvm.bswap.i64(i64 [[TMP31]])
+; CHECK-NEXT:    [[TMP34]] = call i64 @llvm.bswap.i64(i64 [[TMP32]])
+; CHECK-NEXT:    [[TMP35:%.*]] = icmp eq i64 [[TMP33]], [[TMP34]]
+; CHECK-NEXT:    br i1 [[TMP35]], label [[LOADBB5]], label [[RES_BLOCK]]
+; CHECK:       loadbb5:
+; CHECK-NEXT:    [[TMP36:%.*]] = getelementptr i8, ptr [[X]], i64 40
+; CHECK-NEXT:    [[TMP37:%.*]] = getelementptr i8, ptr [[Y]], i64 40
+; CHECK-NEXT:    [[TMP38:%.*]] = load i64, ptr [[TMP36]], align 1
+; CHECK-NEXT:    [[TMP39:%.*]] = load i64, ptr [[TMP37]], align 1
+; CHECK-NEXT:    [[TMP40]] = call i64 @llvm.bswap.i64(i64 [[TMP38]])
+; CHECK-NEXT:    [[TMP41]] = call i64 @llvm.bswap.i64(i64 [[TMP39]])
+; CHECK-NEXT:    [[TMP42:%.*]] = icmp eq i64 [[TMP40]], [[TMP41]]
+; CHECK-NEXT:    br i1 [[TMP42]], label [[LOADBB6]], label [[RES_BLOCK]]
+; CHECK:       loadbb6:
+; CHECK-NEXT:    [[TMP43:%.*]] = getelementptr i8, ptr [[X]], i64 48
+; CHECK-NEXT:    [[TMP44:%.*]] = getelementptr i8, ptr [[Y]], i64 48
+; CHECK-NEXT:    [[TMP45:%.*]] = load i64, ptr [[TMP43]], align 1
+; CHECK-NEXT:    [[TMP46:%.*]] = load i64, ptr [[TMP44]], align 1
+; CHECK-NEXT:    [[TMP47]] = call i64 @llvm.bswap.i64(i64 [[TMP45]])
+; CHECK-NEXT:    [[TMP48]] = call i64 @llvm.bswap.i64(i64 [[TMP46]])
+; CHECK-NEXT:    [[TMP49:%.*]] = icmp eq i64 [[TMP47]], [[TMP48]]
+; CHECK-NEXT:    br i1 [[TMP49]], label [[LOADBB7]], label [[RES_BLOCK]]
+; CHECK:       loadbb7:
+; CHECK-NEXT:    [[TMP50:%.*]] = getelementptr i8, ptr [[X]], i64 56
+; CHECK-NEXT:    [[TMP51:%.*]] = getelementptr i8, ptr [[Y]], i64 56
+; CHECK-NEXT:    [[TMP52:%.*]] = load i64, ptr [[TMP50]], align 1
+; CHECK-NEXT:    [[TMP53:%.*]] = load i64, ptr [[TMP51]], align 1
+; CHECK-NEXT:    [[TMP54]] = call i64 @llvm.bswap.i64(i64 [[TMP52]])
+; CHECK-NEXT:    [[TMP55]] = call i64 @llvm.bswap.i64(i64 [[TMP53]])
+; CHECK-NEXT:    [[TMP56:%.*]] = icmp eq i64 [[TMP54]], [[TMP55]]
+; CHECK-NEXT:    br i1 [[TMP56]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB7]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 64) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length64_eq_const(ptr %X) nounwind {
+; CHECK-LABEL: define i1 @length64_eq_const(
+; CHECK-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = xor i64 [[TMP1]], 3978425819141910832
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 1
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 3833745473465760056
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP6]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], 3689065127958034230
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = xor i64 [[TMP10]], 3544395820347831604
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; CHECK-NEXT:    [[TMP13:%.*]] = load i64, ptr [[TMP12]], align 1
+; CHECK-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP13]], 4123106164818064178
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 40
+; CHECK-NEXT:    [[TMP16:%.*]] = load i64, ptr [[TMP15]], align 1
+; CHECK-NEXT:    [[TMP17:%.*]] = xor i64 [[TMP16]], 3978425819141910832
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr i8, ptr [[X]], i64 48
+; CHECK-NEXT:    [[TMP19:%.*]] = load i64, ptr [[TMP18]], align 1
+; CHECK-NEXT:    [[TMP20:%.*]] = xor i64 [[TMP19]], 3833745473465760056
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr i8, ptr [[X]], i64 56
+; CHECK-NEXT:    [[TMP22:%.*]] = load i64, ptr [[TMP21]], align 1
+; CHECK-NEXT:    [[TMP23:%.*]] = xor i64 [[TMP22]], 3689065127958034230
+; CHECK-NEXT:    [[TMP24:%.*]] = or i64 [[TMP2]], [[TMP5]]
+; CHECK-NEXT:    [[TMP25:%.*]] = or i64 [[TMP8]], [[TMP11]]
+; CHECK-NEXT:    [[TMP26:%.*]] = or i64 [[TMP14]], [[TMP17]]
+; CHECK-NEXT:    [[TMP27:%.*]] = or i64 [[TMP20]], [[TMP23]]
+; CHECK-NEXT:    [[TMP28:%.*]] = or i64 [[TMP24]], [[TMP25]]
+; CHECK-NEXT:    [[TMP29:%.*]] = or i64 [[TMP26]], [[TMP27]]
+; CHECK-NEXT:    [[TMP30:%.*]] = or i64 [[TMP28]], [[TMP29]]
+; CHECK-NEXT:    [[TMP31:%.*]] = icmp ne i64 [[TMP30]], 0
+; CHECK-NEXT:    [[TMP32:%.*]] = zext i1 [[TMP31]] to i32
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP32]], 0
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 64) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length96(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i32 @length96(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR0]]
+; CHECK-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 96) nounwind
+  ret i32 %m
+}
+
+define i1 @length96_eq(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length96_eq(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR0]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 96) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length96_lt(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length96_lt(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR0]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 96) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length96_gt(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length96_gt(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR0]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 96) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length96_eq_const(ptr %X) nounwind {
+; CHECK-LABEL: define i1 @length96_eq_const(
+; CHECK-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 96) #[[ATTR0]]
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 96) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length127(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i32 @length127(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR0]]
+; CHECK-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 127) nounwind
+  ret i32 %m
+}
+
+define i1 @length127_eq(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length127_eq(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR0]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 127) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length127_lt(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length127_lt(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR0]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 127) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length127_gt(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length127_gt(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR0]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 127) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length127_eq_const(ptr %X) nounwind {
+; CHECK-LABEL: define i1 @length127_eq_const(
+; CHECK-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 127) #[[ATTR0]]
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 127) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length128(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i32 @length128(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR0]]
+; CHECK-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 128) nounwind
+  ret i32 %m
+}
+
+define i1 @length128_eq(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length128_eq(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR0]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 128) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length128_lt(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length128_lt(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR0]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 128) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length128_gt(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length128_gt(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR0]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 128) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length128_eq_const(ptr %X) nounwind {
+; CHECK-LABEL: define i1 @length128_eq_const(
+; CHECK-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 128) #[[ATTR0]]
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 128) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length192(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i32 @length192(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR0]]
+; CHECK-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 192) nounwind
+  ret i32 %m
+}
+
+define i1 @length192_eq(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length192_eq(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR0]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 192) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length192_lt(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length192_lt(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR0]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 192) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length192_gt(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length192_gt(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR0]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 192) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length192_eq_const(ptr %X) nounwind {
+; CHECK-LABEL: define i1 @length192_eq_const(
+; CHECK-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 192) #[[ATTR0]]
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 192) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length255(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i32 @length255(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR0]]
+; CHECK-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 255) nounwind
+  ret i32 %m
+}
+
+define i1 @length255_eq(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length255_eq(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR0]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 255) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length255_lt(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length255_lt(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR0]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 255) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length255_gt(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length255_gt(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR0]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 255) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length255_eq_const(ptr %X) nounwind {
+; CHECK-LABEL: define i1 @length255_eq_const(
+; CHECK-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 255) #[[ATTR0]]
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 255) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length256(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i32 @length256(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR0]]
+; CHECK-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 256) nounwind
+  ret i32 %m
+}
+
+define i1 @length256_eq(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length256_eq(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR0]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 256) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length256_lt(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length256_lt(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR0]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 256) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length256_gt(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length256_gt(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR0]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 256) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length256_eq_const(ptr %X) nounwind {
+; CHECK-LABEL: define i1 @length256_eq_const(
+; CHECK-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 256) #[[ATTR0]]
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 256) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length384(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i32 @length384(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR0]]
+; CHECK-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 384) nounwind
+  ret i32 %m
+}
+
+define i1 @length384_eq(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length384_eq(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR0]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 384) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length384_lt(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length384_lt(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR0]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 384) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length384_gt(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length384_gt(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR0]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 384) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length384_eq_const(ptr %X) nounwind {
+; CHECK-LABEL: define i1 @length384_eq_const(
+; CHECK-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 384) #[[ATTR0]]
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 384) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length511(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i32 @length511(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR0]]
+; CHECK-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 511) nounwind
+  ret i32 %m
+}
+
+define i1 @length511_eq(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length511_eq(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR0]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 511) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length511_lt(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length511_lt(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR0]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 511) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length511_gt(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length511_gt(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR0]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 511) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length511_eq_const(ptr %X) nounwind {
+; CHECK-LABEL: define i1 @length511_eq_const(
+; CHECK-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 511) #[[ATTR0]]
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 511) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length512(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i32 @length512(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR0]]
+; CHECK-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 512) nounwind
+  ret i32 %m
+}
+
+define i1 @length512_eq(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length512_eq(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR0]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 512) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length512_lt(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length512_lt(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR0]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 512) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length512_gt(ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: define i1 @length512_gt(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR0]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 512) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length512_eq_const(ptr %X) nounwind {
+; CHECK-LABEL: define i1 @length512_eq_const(
+; CHECK-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 512) #[[ATTR0]]
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 512) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @huge_length(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i32 @huge_length(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 9223372036854775807) #[[ATTR0]]
+; CHECK-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 9223372036854775807) nounwind
+  ret i32 %m
+}
+
+define i1 @huge_length_eq(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i1 @huge_length_eq(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 9223372036854775807) #[[ATTR0]]
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 9223372036854775807) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @nonconst_length(ptr %X, ptr %Y, i64 %size) nounwind {
+; CHECK-LABEL: define i32 @nonconst_length(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i64 [[SIZE:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR0]]
+; CHECK-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 %size) nounwind
+  ret i32 %m
+}
+
+define i1 @nonconst_length_eq(ptr %X, ptr %Y, i64 %size) nounwind {
+; CHECK-LABEL: define i1 @nonconst_length_eq(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i64 [[SIZE:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR0]]
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 %size) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
diff --git a/llvm/test/Transforms/ExpandMemCmp/AArch64/memcmp.ll b/llvm/test/Transforms/ExpandMemCmp/AArch64/memcmp.ll
index 92439691e1873c..735fb27da16060 100644
--- a/llvm/test/Transforms/ExpandMemCmp/AArch64/memcmp.ll
+++ b/llvm/test/Transforms/ExpandMemCmp/AArch64/memcmp.ll
@@ -1,5 +1,4 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
-; RUN: opt -S -expand-memcmp -memcmp-num-loads-per-block=1 -mtriple=aarch64-unknown-unknown < %s | FileCheck %s
 ; RUN: opt -S -passes=expand-memcmp -memcmp-num-loads-per-block=1 -mtriple=aarch64-unknown-unknown < %s | FileCheck %s
 
 declare i32 @memcmp(ptr nocapture, ptr nocapture, i64)
diff --git a/llvm/test/Transforms/ExpandMemCmp/BPF/lit.local.cfg b/llvm/test/Transforms/ExpandMemCmp/BPF/lit.local.cfg
new file mode 100644
index 00000000000000..d1828f2b613d9e
--- /dev/null
+++ b/llvm/test/Transforms/ExpandMemCmp/BPF/lit.local.cfg
@@ -0,0 +1,4 @@
+if not "BPF" in config.root.targets:
+    config.unsupported = True
+if "system-aix" in config.available_features:
+    config.unsupported = True
diff --git a/llvm/test/Transforms/ExpandMemCmp/BPF/memcmp.ll b/llvm/test/Transforms/ExpandMemCmp/BPF/memcmp.ll
new file mode 100644
index 00000000000000..1accfe88d1a82a
--- /dev/null
+++ b/llvm/test/Transforms/ExpandMemCmp/BPF/memcmp.ll
@@ -0,0 +1,119 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt -S -passes=expand-memcmp -mtriple=bpf < %s | FileCheck %s --check-prefix=BPF
+; RUN: opt -S -passes=expand-memcmp -mtriple=bpf -mcpu=v3 < %s | FileCheck %s --check-prefix=BPF-V3
+;
+; Source code:
+;   /* set aligned 4 to minimize the number of loads */
+;   struct build_id {
+;     unsigned char id[20];
+;   } __attribute__((aligned(4)));
+;
+;   /* try to compute a local build_id */
+;   void bar1(void *);
+;
+;   /* the global build_id to compare */
+;   struct build_id id2;
+;
+;   int foo()
+;   {
+;     struct build_id id1;
+;
+;     bar1(&id1);
+;     return __builtin_memcmp(&id1, &id2, sizeof(id1)) == 0;
+;   }
+; Compilation flags:
+;   clang -target bpf -S -O2 t.c -emit-llvm
+
+%struct.build_id = type { [20 x i8] }
+
+@id2 = dso_local global %struct.build_id zeroinitializer, align 4
+
+; Function Attrs: noinline nounwind
+define dso_local i32 @foo() #0 {
+; BPF-LABEL: define dso_local i32 @foo(
+; BPF-SAME: ) #[[ATTR0:[0-9]+]] {
+; BPF-NEXT:  entry:
+; BPF-NEXT:    [[ID1:%.*]] = alloca [[STRUCT_BUILD_ID:%.*]], align 4
+; BPF-NEXT:    call void @bar1(ptr noundef [[ID1]])
+; BPF-NEXT:    br label [[LOADBB:%.*]]
+; BPF:       res_block:
+; BPF-NEXT:    br label [[ENDBLOCK:%.*]]
+; BPF:       loadbb:
+; BPF-NEXT:    [[TMP0:%.*]] = load i64, ptr [[ID1]], align 4
+; BPF-NEXT:    [[TMP1:%.*]] = load i64, ptr @id2, align 4
+; BPF-NEXT:    [[TMP2:%.*]] = icmp ne i64 [[TMP0]], [[TMP1]]
+; BPF-NEXT:    br i1 [[TMP2]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
+; BPF:       loadbb1:
+; BPF-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[ID1]], i64 8
+; BPF-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 4
+; BPF-NEXT:    [[TMP5:%.*]] = load i64, ptr getelementptr (i8, ptr @id2, i64 8), align 4
+; BPF-NEXT:    [[TMP6:%.*]] = icmp ne i64 [[TMP4]], [[TMP5]]
+; BPF-NEXT:    br i1 [[TMP6]], label [[RES_BLOCK]], label [[LOADBB2:%.*]]
+; BPF:       loadbb2:
+; BPF-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[ID1]], i64 16
+; BPF-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4
+; BPF-NEXT:    [[TMP9:%.*]] = load i32, ptr getelementptr (i8, ptr @id2, i64 16), align 4
+; BPF-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP8]], [[TMP9]]
+; BPF-NEXT:    br i1 [[TMP10]], label [[RES_BLOCK]], label [[ENDBLOCK]]
+; BPF:       endblock:
+; BPF-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ 1, [[RES_BLOCK]] ]
+; BPF-NEXT:    [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0
+; BPF-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
+; BPF-NEXT:    ret i32 [[CONV]]
+;
+; BPF-V3-LABEL: define dso_local i32 @foo(
+; BPF-V3-SAME: ) #[[ATTR0:[0-9]+]] {
+; BPF-V3-NEXT:  entry:
+; BPF-V3-NEXT:    [[ID1:%.*]] = alloca [[STRUCT_BUILD_ID:%.*]], align 4
+; BPF-V3-NEXT:    call void @bar1(ptr noundef [[ID1]])
+; BPF-V3-NEXT:    br label [[LOADBB:%.*]]
+; BPF-V3:       res_block:
+; BPF-V3-NEXT:    br label [[ENDBLOCK:%.*]]
+; BPF-V3:       loadbb:
+; BPF-V3-NEXT:    [[TMP0:%.*]] = load i64, ptr [[ID1]], align 4
+; BPF-V3-NEXT:    [[TMP1:%.*]] = load i64, ptr @id2, align 4
+; BPF-V3-NEXT:    [[TMP2:%.*]] = icmp ne i64 [[TMP0]], [[TMP1]]
+; BPF-V3-NEXT:    br i1 [[TMP2]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
+; BPF-V3:       loadbb1:
+; BPF-V3-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[ID1]], i64 8
+; BPF-V3-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 4
+; BPF-V3-NEXT:    [[TMP5:%.*]] = load i64, ptr getelementptr (i8, ptr @id2, i64 8), align 4
+; BPF-V3-NEXT:    [[TMP6:%.*]] = icmp ne i64 [[TMP4]], [[TMP5]]
+; BPF-V3-NEXT:    br i1 [[TMP6]], label [[RES_BLOCK]], label [[LOADBB2:%.*]]
+; BPF-V3:       loadbb2:
+; BPF-V3-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[ID1]], i64 16
+; BPF-V3-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4
+; BPF-V3-NEXT:    [[TMP9:%.*]] = load i32, ptr getelementptr (i8, ptr @id2, i64 16), align 4
+; BPF-V3-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP8]], [[TMP9]]
+; BPF-V3-NEXT:    br i1 [[TMP10]], label [[RES_BLOCK]], label [[ENDBLOCK]]
+; BPF-V3:       endblock:
+; BPF-V3-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ 1, [[RES_BLOCK]] ]
+; BPF-V3-NEXT:    [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0
+; BPF-V3-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
+; BPF-V3-NEXT:    ret i32 [[CONV]]
+;
+entry:
+  %id1 = alloca %struct.build_id, align 4
+  call void @bar1(ptr noundef %id1)
+  %call = call i32 @memcmp(ptr noundef %id1, ptr noundef @id2, i64 noundef 20) #3
+  %cmp = icmp eq i32 %call, 0
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+declare dso_local void @bar1(ptr noundef) #1
+
+; Function Attrs: nounwind
+declare dso_local i32 @memcmp(ptr noundef, ptr noundef, i64 noundef) #2
+
+attributes #0 = { noinline nounwind "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+attributes #1 = { "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+attributes #2 = { nounwind "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+attributes #3 = { nounwind }
+
+!llvm.module.flags = !{!0, !1}
+!llvm.ident = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 7, !"frame-pointer", i32 2}
+!2 = !{!"clang version 18.0.0git (git@github.com:llvm/llvm-project.git a776740d6296520b8bde156aa3f8d9ecb32cddd9)"}
diff --git a/llvm/test/Transforms/ExpandMemCmp/PowerPC/lit.local.cfg b/llvm/test/Transforms/ExpandMemCmp/PowerPC/lit.local.cfg
new file mode 100644
index 00000000000000..bb982488eb15ee
--- /dev/null
+++ b/llvm/test/Transforms/ExpandMemCmp/PowerPC/lit.local.cfg
@@ -0,0 +1,2 @@
+if not "PowerPC" in config.root.targets:
+    config.unsupported = True
diff --git a/llvm/test/Transforms/ExpandMemCmp/PowerPC/memCmpUsedInZeroEqualityComparison.ll b/llvm/test/Transforms/ExpandMemCmp/PowerPC/memCmpUsedInZeroEqualityComparison.ll
new file mode 100644
index 00000000000000..9a75b147e7e1fb
--- /dev/null
+++ b/llvm/test/Transforms/ExpandMemCmp/PowerPC/memCmpUsedInZeroEqualityComparison.ll
@@ -0,0 +1,218 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt -S -passes=expand-memcmp -mcpu=pwr8 -mtriple=powerpc64le-unknown-gnu-linux < %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-unknown-linux-gnu"
+
+@zeroEqualityTest01.buffer1 = private unnamed_addr constant [3 x i32] [i32 1, i32 2, i32 4], align 4
+@zeroEqualityTest01.buffer2 = private unnamed_addr constant [3 x i32] [i32 1, i32 2, i32 3], align 4
+@zeroEqualityTest02.buffer1 = private unnamed_addr constant [4 x i32] [i32 4, i32 0, i32 0, i32 0], align 4
+@zeroEqualityTest02.buffer2 = private unnamed_addr constant [4 x i32] [i32 3, i32 0, i32 0, i32 0], align 4
+@zeroEqualityTest03.buffer1 = private unnamed_addr constant [4 x i32] [i32 0, i32 0, i32 0, i32 3], align 4
+@zeroEqualityTest03.buffer2 = private unnamed_addr constant [4 x i32] [i32 0, i32 0, i32 0, i32 4], align 4
+@zeroEqualityTest04.buffer1 = private unnamed_addr constant [15 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14], align 4
+@zeroEqualityTest04.buffer2 = private unnamed_addr constant [15 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 13], align 4
+
+declare signext i32 @memcmp(ptr nocapture, ptr nocapture, i64) local_unnamed_addr #1
+
+; Check 4 bytes - requires 1 load for each param.
+define signext i32 @zeroEqualityTest02(ptr %x, ptr %y) {
+; CHECK-LABEL: define signext i32 @zeroEqualityTest02(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; CHECK-NEXT:    [[DOT:%.*]] = zext i1 [[TMP3]] to i32
+; CHECK-NEXT:    ret i32 [[DOT]]
+;
+  %call = tail call signext i32 @memcmp(ptr %x, ptr %y, i64 4)
+  %not.cmp = icmp ne i32 %call, 0
+  %. = zext i1 %not.cmp to i32
+  ret i32 %.
+}
+
+; Check 16 bytes - requires 2 loads for each param (or use vectors?).
+define signext i32 @zeroEqualityTest01(ptr %x, ptr %y) {
+; CHECK-LABEL: define signext i32 @zeroEqualityTest01(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    br label [[LOADBB:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb:
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    br i1 [[TMP3]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    br i1 [[TMP8]], label [[RES_BLOCK]], label [[ENDBLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
+; CHECK-NEXT:    [[NOT_TOBOOL:%.*]] = icmp ne i32 [[PHI_RES]], 0
+; CHECK-NEXT:    [[DOT:%.*]] = zext i1 [[NOT_TOBOOL]] to i32
+; CHECK-NEXT:    ret i32 [[DOT]]
+;
+  %call = tail call signext i32 @memcmp(ptr %x, ptr %y, i64 16)
+  %not.tobool = icmp ne i32 %call, 0
+  %. = zext i1 %not.tobool to i32
+  ret i32 %.
+}
+
+; Check 7 bytes - requires 3 loads for each param.
+define signext i32 @zeroEqualityTest03(ptr %x, ptr %y) {
+; CHECK-LABEL: define signext i32 @zeroEqualityTest03(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    br label [[LOADBB:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb:
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    br i1 [[TMP3]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; CHECK-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i16, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne i16 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    br i1 [[TMP8]], label [[RES_BLOCK]], label [[LOADBB2:%.*]]
+; CHECK:       loadbb2:
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 6
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 6
+; CHECK-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; CHECK-NEXT:    [[TMP12:%.*]] = load i8, ptr [[TMP10]], align 1
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp ne i8 [[TMP11]], [[TMP12]]
+; CHECK-NEXT:    br i1 [[TMP13]], label [[RES_BLOCK]], label [[ENDBLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ 1, [[RES_BLOCK]] ]
+; CHECK-NEXT:    [[NOT_LNOT:%.*]] = icmp ne i32 [[PHI_RES]], 0
+; CHECK-NEXT:    [[COND:%.*]] = zext i1 [[NOT_LNOT]] to i32
+; CHECK-NEXT:    ret i32 [[COND]]
+;
+  %call = tail call signext i32 @memcmp(ptr %x, ptr %y, i64 7)
+  %not.lnot = icmp ne i32 %call, 0
+  %cond = zext i1 %not.lnot to i32
+  ret i32 %cond
+}
+
+; Validate with > 0
+define signext i32 @zeroEqualityTest04() {
+; CHECK-LABEL: define signext i32 @zeroEqualityTest04(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:    br label [[LOADBB:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ 288230376151711744, [[LOADBB]] ], [ 0, [[LOADBB1:%.*]] ]
+; CHECK-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ 216172782113783808, [[LOADBB]] ], [ 0, [[LOADBB1]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb:
+; CHECK-NEXT:    br i1 false, label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    br i1 true, label [[ENDBLOCK]], label [[RES_BLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; CHECK-NEXT:    [[NOT_CMP:%.*]] = icmp slt i32 [[PHI_RES]], 1
+; CHECK-NEXT:    [[DOT:%.*]] = zext i1 [[NOT_CMP]] to i32
+; CHECK-NEXT:    ret i32 [[DOT]]
+;
+  %call = tail call signext i32 @memcmp(ptr @zeroEqualityTest02.buffer1, ptr @zeroEqualityTest02.buffer2, i64 16)
+  %not.cmp = icmp slt i32 %call, 1
+  %. = zext i1 %not.cmp to i32
+  ret i32 %.
+}
+
+; Validate with < 0
+define signext i32 @zeroEqualityTest05() {
+; CHECK-LABEL: define signext i32 @zeroEqualityTest05(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:    br label [[LOADBB:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ 0, [[LOADBB]] ], [ 50331648, [[LOADBB1:%.*]] ]
+; CHECK-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ 0, [[LOADBB]] ], [ 67108864, [[LOADBB1]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb:
+; CHECK-NEXT:    br i1 true, label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    br i1 false, label [[ENDBLOCK]], label [[RES_BLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; CHECK-NEXT:    [[CALL_LOBIT:%.*]] = lshr i32 [[PHI_RES]], 31
+; CHECK-NEXT:    [[CALL_LOBIT_NOT:%.*]] = xor i32 [[CALL_LOBIT]], 1
+; CHECK-NEXT:    ret i32 [[CALL_LOBIT_NOT]]
+;
+  %call = tail call signext i32 @memcmp(ptr @zeroEqualityTest03.buffer1, ptr @zeroEqualityTest03.buffer2, i64 16)
+  %call.lobit = lshr i32 %call, 31
+  %call.lobit.not = xor i32 %call.lobit, 1
+  ret i32 %call.lobit.not
+}
+
+; Validate with memcmp()?:
+define signext i32 @equalityFoldTwoConstants() {
+; CHECK-LABEL: define signext i32 @equalityFoldTwoConstants(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:    br label [[LOADBB:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb:
+; CHECK-NEXT:    br i1 false, label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    br i1 false, label [[RES_BLOCK]], label [[ENDBLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
+; CHECK-NEXT:    [[NOT_TOBOOL:%.*]] = icmp eq i32 [[PHI_RES]], 0
+; CHECK-NEXT:    [[COND:%.*]] = zext i1 [[NOT_TOBOOL]] to i32
+; CHECK-NEXT:    ret i32 [[COND]]
+;
+  %call = tail call signext i32 @memcmp(ptr @zeroEqualityTest04.buffer1, ptr @zeroEqualityTest04.buffer2, i64 16)
+  %not.tobool = icmp eq i32 %call, 0
+  %cond = zext i1 %not.tobool to i32
+  ret i32 %cond
+}
+
+define signext i32 @equalityFoldOneConstant(ptr %X) {
+; CHECK-LABEL: define signext i32 @equalityFoldOneConstant(
+; CHECK-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    br label [[LOADBB:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb:
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ne i64 4294967296, [[TMP1]]
+; CHECK-NEXT:    br i1 [[TMP2]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 1
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne i64 12884901890, [[TMP4]]
+; CHECK-NEXT:    br i1 [[TMP5]], label [[RES_BLOCK]], label [[ENDBLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
+; CHECK-NEXT:    [[NOT_TOBOOL:%.*]] = icmp eq i32 [[PHI_RES]], 0
+; CHECK-NEXT:    [[COND:%.*]] = zext i1 [[NOT_TOBOOL]] to i32
+; CHECK-NEXT:    ret i32 [[COND]]
+;
+  %call = tail call signext i32 @memcmp(ptr @zeroEqualityTest04.buffer1, ptr %X, i64 16)
+  %not.tobool = icmp eq i32 %call, 0
+  %cond = zext i1 %not.tobool to i32
+  ret i32 %cond
+}
+
+define i1 @length2_eq_nobuiltin_attr(ptr %X, ptr %Y) nounwind {
+; CHECK-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT:    [[M:%.*]] = tail call signext i32 @memcmp(ptr [[X]], ptr [[Y]], i64 2) #[[ATTR3:[0-9]+]]
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %m = tail call signext i32 @memcmp(ptr %X, ptr %Y, i64 2) nobuiltin
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
diff --git a/llvm/test/Transforms/ExpandMemCmp/PowerPC/memcmp-mergeexpand.ll b/llvm/test/Transforms/ExpandMemCmp/PowerPC/memcmp-mergeexpand.ll
new file mode 100644
index 00000000000000..ffc49478cfa4d3
--- /dev/null
+++ b/llvm/test/Transforms/ExpandMemCmp/PowerPC/memcmp-mergeexpand.ll
@@ -0,0 +1,48 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt -S -passes=expand-memcmp -mcpu=pwr8 -mtriple=powerpc64le-unknown-gnu-linux < %s | FileCheck %s
+
+; This tests interaction between MergeICmp and expand-memcmp. Only expand-memcmp is run here, so the compare chain below is expected to stay unmerged.
+
+%"struct.std::pair" = type { i32, i32 }
+
+define zeroext i1 @opeq1(
+; CHECK-LABEL: define zeroext i1 @opeq1(
+; CHECK-SAME: ptr nocapture readonly dereferenceable(8) [[A:%.*]], ptr nocapture readonly dereferenceable(8) [[B:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[B]], align 4
+; CHECK-NEXT:    [[CMP_I:%.*]] = icmp eq i32 [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    br i1 [[CMP_I]], label [[LAND_RHS_I:%.*]], label [[OPEQ1_EXIT:%.*]]
+; CHECK:       land.rhs.i:
+; CHECK-NEXT:    [[SECOND_I:%.*]] = getelementptr inbounds %"struct.std::pair", ptr [[A]], i64 0, i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[SECOND_I]], align 4
+; CHECK-NEXT:    [[SECOND2_I:%.*]] = getelementptr inbounds %"struct.std::pair", ptr [[B]], i64 0, i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[SECOND2_I]], align 4
+; CHECK-NEXT:    [[CMP3_I:%.*]] = icmp eq i32 [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    br label [[OPEQ1_EXIT]]
+; CHECK:       opeq1.exit:
+; CHECK-NEXT:    [[TMP4:%.*]] = phi i1 [ false, [[ENTRY:%.*]] ], [ [[CMP3_I]], [[LAND_RHS_I]] ]
+; CHECK-NEXT:    ret i1 [[TMP4]]
+;
+  ptr nocapture readonly dereferenceable(8) %a,
+  ptr nocapture readonly dereferenceable(8) %b) local_unnamed_addr #0 {
+entry:
+  %0 = load i32, ptr %a, align 4
+  %1 = load i32, ptr %b, align 4
+  %cmp.i = icmp eq i32 %0, %1
+  br i1 %cmp.i, label %land.rhs.i, label %opeq1.exit
+
+land.rhs.i:
+  %second.i = getelementptr inbounds %"struct.std::pair", ptr %a, i64 0, i32 1
+  %2 = load i32, ptr %second.i, align 4
+  %second2.i = getelementptr inbounds %"struct.std::pair", ptr %b, i64 0, i32 1
+  %3 = load i32, ptr %second2.i, align 4
+  %cmp3.i = icmp eq i32 %2, %3
+  br label %opeq1.exit
+
+opeq1.exit:
+  %4 = phi i1 [ false, %entry ], [ %cmp3.i, %land.rhs.i ]
+  ret i1 %4
+}
+
+
diff --git a/llvm/test/Transforms/ExpandMemCmp/PowerPC/memcmp.ll b/llvm/test/Transforms/ExpandMemCmp/PowerPC/memcmp.ll
new file mode 100644
index 00000000000000..21cdbd65544c4c
--- /dev/null
+++ b/llvm/test/Transforms/ExpandMemCmp/PowerPC/memcmp.ll
@@ -0,0 +1,70 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt -S -passes=expand-memcmp -mcpu=pwr8 -mtriple=powerpc64le-unknown-gnu-linux < %s | FileCheck %s
+
+define signext i32 @memcmp8(ptr nocapture readonly %buffer1, ptr nocapture readonly %buffer2) {
+; CHECK-LABEL: define signext i32 @memcmp8(
+; CHECK-SAME: ptr nocapture readonly [[BUFFER1:%.*]], ptr nocapture readonly [[BUFFER2:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[BUFFER1]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[BUFFER2]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP1]])
+; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ugt i64 [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp ult i64 [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; CHECK-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; CHECK-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    ret i32 [[TMP9]]
+;
+  %call = tail call signext i32 @memcmp(ptr %buffer1, ptr %buffer2, i64 8)
+  ret i32 %call
+}
+
+define signext i32 @memcmp4(ptr nocapture readonly %buffer1, ptr nocapture readonly %buffer2) {
+; CHECK-LABEL: define signext i32 @memcmp4(
+; CHECK-SAME: ptr nocapture readonly [[BUFFER1:%.*]], ptr nocapture readonly [[BUFFER2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[BUFFER1]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[BUFFER2]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; CHECK-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; CHECK-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    ret i32 [[TMP9]]
+;
+  %call = tail call signext i32 @memcmp(ptr %buffer1, ptr %buffer2, i64 4)
+  ret i32 %call
+}
+
+define signext i32 @memcmp2(ptr nocapture readonly %buffer1, ptr nocapture readonly %buffer2) {
+; CHECK-LABEL: define signext i32 @memcmp2(
+; CHECK-SAME: ptr nocapture readonly [[BUFFER1:%.*]], ptr nocapture readonly [[BUFFER2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[BUFFER1]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr [[BUFFER2]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; CHECK-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; CHECK-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; CHECK-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; CHECK-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    ret i32 [[TMP7]]
+;
+  %call = tail call signext i32 @memcmp(ptr %buffer1, ptr %buffer2, i64 2)
+  ret i32 %call
+}
+
+define signext i32 @memcmp1(ptr nocapture readonly %buffer1, ptr nocapture readonly %buffer2) {
+; CHECK-LABEL: define signext i32 @memcmp1(
+; CHECK-SAME: ptr nocapture readonly [[BUFFER1:%.*]], ptr nocapture readonly [[BUFFER2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[BUFFER1]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr [[BUFFER2]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = zext i8 [[TMP1]] to i32
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i8 [[TMP2]] to i32
+; CHECK-NEXT:    [[TMP5:%.*]] = sub i32 [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    ret i32 [[TMP5]]
+;
+  %call = tail call signext i32 @memcmp(ptr %buffer1, ptr %buffer2, i64 1) #2
+  ret i32 %call
+}
+
+declare signext i32 @memcmp(ptr, ptr, i64)
diff --git a/llvm/test/Transforms/ExpandMemCmp/PowerPC/memcmpIR.ll b/llvm/test/Transforms/ExpandMemCmp/PowerPC/memcmpIR.ll
new file mode 100644
index 00000000000000..3ad0c9d12ea0bc
--- /dev/null
+++ b/llvm/test/Transforms/ExpandMemCmp/PowerPC/memcmpIR.ll
@@ -0,0 +1,216 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt -S -passes=expand-memcmp -mcpu=pwr8 -mtriple=powerpc64le-unknown-gnu-linux < %s | FileCheck %s
+
+define signext i32 @test1(ptr nocapture readonly %buffer1, ptr nocapture readonly %buffer2)  {
+; CHECK-LABEL: define signext i32 @test1(
+; CHECK-SAME: ptr nocapture readonly [[BUFFER1:%.*]], ptr nocapture readonly [[BUFFER2:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOADBB:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4:%.*]], [[LOADBB]] ], [ [[TMP11:%.*]], [[LOADBB1:%.*]] ]
+; CHECK-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[TMP0]], i32 -1, i32 1
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb:
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[BUFFER1]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr [[BUFFER2]], align 1
+; CHECK-NEXT:    [[TMP4]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
+; CHECK-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    br i1 [[TMP6]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[BUFFER1]], i64 8
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[BUFFER2]], i64 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 1
+; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP11]] = call i64 @llvm.bswap.i64(i64 [[TMP9]])
+; CHECK-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[TMP11]], [[TMP12]]
+; CHECK-NEXT:    br i1 [[TMP13]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP1]], [[RES_BLOCK]] ]
+; CHECK-NEXT:    ret i32 [[PHI_RES]]
+;
+entry:
+
+
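+  ; NOTE: no RUN line in this file passes --check-prefix=CHECK-BE, so the CHECK-BE lines are currently unchecked.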
+  ; CHECK-BE-LABEL: @test1(
+  ; CHECK-BE-LABEL: res_block:{{.*}}
+  ; CHECK-BE: [[ICMP2:%[0-9]+]] = icmp ult i64
+  ; CHECK-BE-NEXT: [[SELECT:%[0-9]+]] = select i1 [[ICMP2]], i32 -1, i32 1
+  ; CHECK-BE-NEXT: br label %endblock
+
+  ; CHECK-BE-LABEL: loadbb:{{.*}}
+  ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i64, ptr
+  ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i64, ptr
+  ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp eq i64 [[LOAD1]], [[LOAD2]]
+  ; CHECK-BE-NEXT:  br i1 [[ICMP]], label %loadbb1, label %res_block
+
+  ; CHECK-BE-LABEL: loadbb1:{{.*}}
+  ; CHECK-BE-NEXT: [[GEP1:%[0-9]+]] = getelementptr i8, ptr {{.*}}, i64 8
+  ; CHECK-BE-NEXT: [[GEP2:%[0-9]+]] = getelementptr i8, ptr {{.*}}, i64 8
+  ; CHECK-BE-NEXT: [[LOAD1:%[0-9]+]] = load i64, ptr [[GEP1]]
+  ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i64, ptr [[GEP2]]
+  ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp eq i64 [[LOAD1]], [[LOAD2]]
+  ; CHECK-BE-NEXT:  br i1 [[ICMP]], label %endblock, label %res_block
+
+  %call = tail call signext i32 @memcmp(ptr %buffer1, ptr %buffer2, i64 16)
+  ret i32 %call
+}
+
+declare signext i32 @memcmp(ptr nocapture, ptr nocapture, i64) local_unnamed_addr #1
+
+define signext i32 @test2(ptr nocapture readonly %buffer1, ptr nocapture readonly %buffer2)  {
+; CHECK-LABEL: define signext i32 @test2(
+; CHECK-SAME: ptr nocapture readonly [[BUFFER1:%.*]], ptr nocapture readonly [[BUFFER2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[BUFFER1]], align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[BUFFER2]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP0]])
+; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp ugt i32 [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = zext i1 [[TMP4]] to i32
+; CHECK-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; CHECK-NEXT:    [[TMP8:%.*]] = sub i32 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    ret i32 [[TMP8]]
+;
+
+  ; CHECK-BE-LABEL: @test2(
+  ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i32, ptr
+  ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i32, ptr
+  ; CHECK-BE-NEXT: [[CMP1:%[0-9]+]] = icmp ugt i32 [[LOAD1]], [[LOAD2]]
+  ; CHECK-BE-NEXT: [[CMP2:%[0-9]+]] = icmp ult i32 [[LOAD1]], [[LOAD2]]
+  ; CHECK-BE-NEXT: [[Z1:%[0-9]+]] = zext i1 [[CMP1]] to i32
+  ; CHECK-BE-NEXT: [[Z2:%[0-9]+]] = zext i1 [[CMP2]] to i32
+  ; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i32 [[Z1]], [[Z2]]
+  ; CHECK-BE-NEXT: ret i32 [[SUB]]
+
+entry:
+  %call = tail call signext i32 @memcmp(ptr %buffer1, ptr %buffer2, i64 4)
+  ret i32 %call
+}
+
+define signext i32 @test3(ptr nocapture readonly %buffer1, ptr nocapture readonly %buffer2)  {
+; CHECK-LABEL: define signext i32 @test3(
+; CHECK-SAME: ptr nocapture readonly [[BUFFER1:%.*]], ptr nocapture readonly [[BUFFER2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOADBB:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1:%.*]] ], [ [[TMP22:%.*]], [[LOADBB2:%.*]] ]
+; CHECK-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP14:%.*]], [[LOADBB1]] ], [ [[TMP23:%.*]], [[LOADBB2]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[TMP0]], i32 -1, i32 1
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb:
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[BUFFER1]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr [[BUFFER2]], align 1
+; CHECK-NEXT:    [[TMP4]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
+; CHECK-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    br i1 [[TMP6]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[BUFFER1]], i64 8
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[BUFFER2]], i64 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP7]], align 1
+; CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP9]])
+; CHECK-NEXT:    [[TMP12:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; CHECK-NEXT:    [[TMP13]] = zext i32 [[TMP11]] to i64
+; CHECK-NEXT:    [[TMP14]] = zext i32 [[TMP12]] to i64
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[TMP13]], [[TMP14]]
+; CHECK-NEXT:    br i1 [[TMP15]], label [[LOADBB2]], label [[RES_BLOCK]]
+; CHECK:       loadbb2:
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[BUFFER1]], i64 12
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr i8, ptr [[BUFFER2]], i64 12
+; CHECK-NEXT:    [[TMP18:%.*]] = load i16, ptr [[TMP16]], align 1
+; CHECK-NEXT:    [[TMP19:%.*]] = load i16, ptr [[TMP17]], align 1
+; CHECK-NEXT:    [[TMP20:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP18]])
+; CHECK-NEXT:    [[TMP21:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP19]])
+; CHECK-NEXT:    [[TMP22]] = zext i16 [[TMP20]] to i64
+; CHECK-NEXT:    [[TMP23]] = zext i16 [[TMP21]] to i64
+; CHECK-NEXT:    [[TMP24:%.*]] = icmp eq i64 [[TMP22]], [[TMP23]]
+; CHECK-NEXT:    br i1 [[TMP24]], label [[LOADBB3:%.*]], label [[RES_BLOCK]]
+; CHECK:       loadbb3:
+; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr i8, ptr [[BUFFER1]], i64 14
+; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr i8, ptr [[BUFFER2]], i64 14
+; CHECK-NEXT:    [[TMP27:%.*]] = load i8, ptr [[TMP25]], align 1
+; CHECK-NEXT:    [[TMP28:%.*]] = load i8, ptr [[TMP26]], align 1
+; CHECK-NEXT:    [[TMP29:%.*]] = zext i8 [[TMP27]] to i32
+; CHECK-NEXT:    [[TMP30:%.*]] = zext i8 [[TMP28]] to i32
+; CHECK-NEXT:    [[TMP31:%.*]] = sub i32 [[TMP29]], [[TMP30]]
+; CHECK-NEXT:    br label [[ENDBLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP31]], [[LOADBB3]] ], [ [[TMP1]], [[RES_BLOCK]] ]
+; CHECK-NEXT:    ret i32 [[PHI_RES]]
+;
+
+
+
+
+
+  ; CHECK-BE-LABEL: res_block:{{.*}}
+  ; CHECK-BE: [[ICMP2:%[0-9]+]] = icmp ult i64
+  ; CHECK-BE-NEXT: [[SELECT:%[0-9]+]] = select i1 [[ICMP2]], i32 -1, i32 1
+  ; CHECK-BE-NEXT: br label %endblock
+
+  ; CHECK-BE-LABEL: loadbb:{{.*}}
+  ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i64, ptr
+  ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i64, ptr
+  ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp eq i64 [[LOAD1]], [[LOAD2]]
+  ; CHECK-BE-NEXT:  br i1 [[ICMP]], label %loadbb1, label %res_block
+
+  ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i32, ptr
+  ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i32, ptr
+  ; CHECK-BE-NEXT: [[ZEXT1:%[0-9]+]] = zext i32 [[LOAD1]] to i64
+  ; CHECK-BE-NEXT: [[ZEXT2:%[0-9]+]] = zext i32 [[LOAD2]] to i64
+  ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp eq i64 [[ZEXT1]], [[ZEXT2]]
+  ; CHECK-BE-NEXT:  br i1 [[ICMP]], label %loadbb2, label %res_block
+
+  ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i16, ptr
+  ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i16, ptr
+  ; CHECK-BE-NEXT: [[ZEXT1:%[0-9]+]] = zext i16 [[LOAD1]] to i64
+  ; CHECK-BE-NEXT: [[ZEXT2:%[0-9]+]] = zext i16 [[LOAD2]] to i64
+  ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp eq i64 [[ZEXT1]], [[ZEXT2]]
+  ; CHECK-BE-NEXT:  br i1 [[ICMP]], label %loadbb3, label %res_block
+
+  ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i8, ptr
+  ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i8, ptr
+  ; CHECK-BE-NEXT: [[ZEXT1:%[0-9]+]] = zext i8 [[LOAD1]] to i32
+  ; CHECK-BE-NEXT: [[ZEXT2:%[0-9]+]] = zext i8 [[LOAD2]] to i32
+  ; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i32 [[ZEXT1]], [[ZEXT2]]
+  ; CHECK-BE-NEXT:  br label %endblock
+
+entry:
+  %call = tail call signext i32 @memcmp(ptr %buffer1, ptr %buffer2, i64 15)
+  ret i32 %call
+}
+  ; CHECK-BE: call = tail call signext i32 @memcmp
+define signext i32 @test4(ptr nocapture readonly %buffer1, ptr nocapture readonly %buffer2)  {
+; CHECK-LABEL: define signext i32 @test4(
+; CHECK-SAME: ptr nocapture readonly [[BUFFER1:%.*]], ptr nocapture readonly [[BUFFER2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CALL:%.*]] = tail call signext i32 @memcmp(ptr [[BUFFER1]], ptr [[BUFFER2]], i64 65)
+; CHECK-NEXT:    ret i32 [[CALL]]
+;
+entry:
+  %call = tail call signext i32 @memcmp(ptr %buffer1, ptr %buffer2, i64 65)
+  ret i32 %call
+}
+
+define signext i32 @test5(ptr nocapture readonly %buffer1, ptr nocapture readonly %buffer2, i32 signext %SIZE)  {
+; CHECK-LABEL: define signext i32 @test5(
+; CHECK-SAME: ptr nocapture readonly [[BUFFER1:%.*]], ptr nocapture readonly [[BUFFER2:%.*]], i32 signext [[SIZE:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CONV:%.*]] = sext i32 [[SIZE]] to i64
+; CHECK-NEXT:    [[CALL:%.*]] = tail call signext i32 @memcmp(ptr [[BUFFER1]], ptr [[BUFFER2]], i64 [[CONV]])
+; CHECK-NEXT:    ret i32 [[CALL]]
+;
+  ; CHECK-BE: call = tail call signext i32 @memcmp
+entry:
+  %conv = sext i32 %SIZE to i64
+  %call = tail call signext i32 @memcmp(ptr %buffer1, ptr %buffer2, i64 %conv)
+  ret i32 %call
+}
diff --git a/llvm/test/Transforms/ExpandMemCmp/X86/bcmp.ll b/llvm/test/Transforms/ExpandMemCmp/X86/bcmp.ll
index 41d357728b93e7..5877d00a818c5f 100644
--- a/llvm/test/Transforms/ExpandMemCmp/X86/bcmp.ll
+++ b/llvm/test/Transforms/ExpandMemCmp/X86/bcmp.ll
@@ -1,16 +1,16 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -expand-memcmp -memcmp-num-loads-per-block=1 -mtriple=x86_64-unknown-unknown -data-layout=e-m:o-i64:64-f80:128-n8:16:32:64-S128         < %s | FileCheck %s --check-prefix=X64
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
 ; RUN: opt -S -passes=expand-memcmp -memcmp-num-loads-per-block=1 -mtriple=x86_64-unknown-unknown -data-layout=e-m:o-i64:64-f80:128-n8:16:32:64-S128         < %s | FileCheck %s --check-prefix=X64
 
 declare i32 @bcmp(ptr nocapture, ptr nocapture, i64)
 
 define i32 @bcmp8(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X64-LABEL: @bcmp8(
-; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X:%.*]], align 1
-; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y:%.*]], align 1
-; X64-NEXT:    [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]]
-; X64-NEXT:    [[TMP6:%.*]] = zext i1 [[TMP5]] to i32
-; X64-NEXT:    ret i32 [[TMP6]]
+; X64-LABEL: define i32 @bcmp8(
+; X64-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X64-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-NEXT:    ret i32 [[TMP4]]
 ;
   %call = tail call i32 @bcmp(ptr %x, ptr %y, i64 8)
   ret i32 %call
diff --git a/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-2.ll b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-2.ll
new file mode 100644
index 00000000000000..4424488a7fffb1
--- /dev/null
+++ b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-2.ll
@@ -0,0 +1,20249 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt -S -passes=expand-memcmp -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4  -mtriple=x86_64-unknown-unknown               < %s | FileCheck %s --check-prefixes=X64
+; RUN: opt -S -passes=expand-memcmp -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 -mtriple=x86_64-unknown-unknown -mattr=sse4.1 < %s | FileCheck %s --check-prefixes=X64-SSE41
+; RUN: opt -S -passes=expand-memcmp -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 -mtriple=x86_64-unknown-unknown -mattr=avx    < %s | FileCheck %s --check-prefixes=X64-AVX1
+; RUN: opt -S -passes=expand-memcmp -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 -mtriple=x86_64-unknown-unknown -mattr=avx2   < %s | FileCheck %s --check-prefixes=X64-AVX2
+; RUN: opt -S -passes=expand-memcmp -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 -mtriple=x86_64-unknown-unknown -mattr=avx512bw,+prefer-256-bit < %s | FileCheck %s --check-prefixes=X64-AVX512BW-256
+; RUN: opt -S -passes=expand-memcmp -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 -mtriple=x86_64-unknown-unknown -mattr=avx512bw,-prefer-256-bit < %s | FileCheck %s --check-prefixes=X64-AVX512BW
+; RUN: opt -S -passes=expand-memcmp -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 -mtriple=x86_64-unknown-unknown -mattr=avx512f,+prefer-256-bit,-prefer-mask-registers < %s | FileCheck %s --check-prefixes=X64-AVX512F-256
+; RUN: opt -S -passes=expand-memcmp -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 -mtriple=x86_64-unknown-unknown -mattr=avx512f,-prefer-256-bit,-prefer-mask-registers < %s | FileCheck %s --check-prefixes=X64-AVX512F
+; RUN: opt -S -passes=expand-memcmp -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 -mtriple=x86_64-unknown-unknown -mattr=avx512f,+prefer-256-bit,+prefer-mask-registers < %s | FileCheck %s --check-prefixes=X64-MIC-AVX2
+; RUN: opt -S -passes=expand-memcmp -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 -mtriple=x86_64-unknown-unknown -mattr=avx512f,-prefer-256-bit,+prefer-mask-registers < %s | FileCheck %s --check-prefixes=X64-MIC-AVX512F
+
+; This tests the inlining/optimization of memcmp done by the expand-memcmp pass (previously performed at codegen time)
+; rdar://6480398
+
+ at .str = private constant [513 x i8] c"01234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901\00", align 1
+
+declare dso_local i32 @memcmp(ptr, ptr, i64)
+
+define i32 @length0(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length0(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0:[0-9]+]] {
+; X64-NEXT:    ret i32 0
+;
+; X64-SSE41-LABEL: define i32 @length0(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1:[0-9]+]] {
+; X64-SSE41-NEXT:    ret i32 0
+;
+; X64-AVX1-LABEL: define i32 @length0(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1:[0-9]+]] {
+; X64-AVX1-NEXT:    ret i32 0
+;
+; X64-AVX2-LABEL: define i32 @length0(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1:[0-9]+]] {
+; X64-AVX2-NEXT:    ret i32 0
+;
+; X64-AVX512BW-256-LABEL: define i32 @length0(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1:[0-9]+]] {
+; X64-AVX512BW-256-NEXT:    ret i32 0
+;
+; X64-AVX512BW-LABEL: define i32 @length0(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1:[0-9]+]] {
+; X64-AVX512BW-NEXT:    ret i32 0
+;
+; X64-AVX512F-256-LABEL: define i32 @length0(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1:[0-9]+]] {
+; X64-AVX512F-256-NEXT:    ret i32 0
+;
+; X64-AVX512F-LABEL: define i32 @length0(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1:[0-9]+]] {
+; X64-AVX512F-NEXT:    ret i32 0
+;
+; X64-MIC-AVX2-LABEL: define i32 @length0(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1:[0-9]+]] {
+; X64-MIC-AVX2-NEXT:    ret i32 0
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length0(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1:[0-9]+]] {
+; X64-MIC-AVX512F-NEXT:    ret i32 0
+;
+
+
+
+; Zero-length memcmp returned directly; the assertions above expect every configuration to replace the call with the constant 0.
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 0) nounwind
+  ret i32 %m
+  }
+
+define i1 @length0_eq(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length0_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    ret i1 true
+;
+; X64-SSE41-LABEL: define i1 @length0_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    ret i1 true
+;
+; X64-AVX1-LABEL: define i1 @length0_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    ret i1 true
+;
+; X64-AVX2-LABEL: define i1 @length0_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    ret i1 true
+;
+; X64-AVX512BW-256-LABEL: define i1 @length0_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    ret i1 true
+;
+; X64-AVX512BW-LABEL: define i1 @length0_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    ret i1 true
+;
+; X64-AVX512F-256-LABEL: define i1 @length0_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    ret i1 true
+;
+; X64-AVX512F-LABEL: define i1 @length0_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    ret i1 true
+;
+; X64-MIC-AVX2-LABEL: define i1 @length0_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    ret i1 true
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length0_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    ret i1 true
+;
+
+
+
+
+; Zero-length memcmp compared for equality with 0; the assertions above expect every configuration to reduce the function to the constant true.
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 0) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length0_lt(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length0_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    ret i1 false
+;
+; X64-SSE41-LABEL: define i1 @length0_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    ret i1 false
+;
+; X64-AVX1-LABEL: define i1 @length0_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    ret i1 false
+;
+; X64-AVX2-LABEL: define i1 @length0_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    ret i1 false
+;
+; X64-AVX512BW-256-LABEL: define i1 @length0_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    ret i1 false
+;
+; X64-AVX512BW-LABEL: define i1 @length0_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    ret i1 false
+;
+; X64-AVX512F-256-LABEL: define i1 @length0_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    ret i1 false
+;
+; X64-AVX512F-LABEL: define i1 @length0_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    ret i1 false
+;
+; X64-MIC-AVX2-LABEL: define i1 @length0_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    ret i1 false
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length0_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    ret i1 false
+;
+
+
+
+
+; Zero-length memcmp tested for a negative result; the assertions above expect every configuration to reduce the function to the constant false.
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 0) nounwind
+  %c = icmp slt i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length2(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length2(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-NEXT:    ret i32 [[TMP7]]
+;
+; X64-SSE41-LABEL: define i32 @length2(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    ret i32 [[TMP7]]
+;
+; X64-AVX1-LABEL: define i32 @length2(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    ret i32 [[TMP7]]
+;
+; X64-AVX2-LABEL: define i32 @length2(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    ret i32 [[TMP7]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length2(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    ret i32 [[TMP7]]
+;
+; X64-AVX512BW-LABEL: define i32 @length2(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    ret i32 [[TMP7]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length2(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    ret i32 [[TMP7]]
+;
+; X64-AVX512F-LABEL: define i32 @length2(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    ret i32 [[TMP7]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length2(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    ret i32 [[TMP7]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length2(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[TMP7]]
+;
+
+
+
+
+
+
+
+
+
+; Two-byte memcmp returning the ordering result; every configuration above expects two align-1 i16 loads, a bswap of each (presumably to compare in memory byte order on little-endian x86), zext to i32 and a subtraction.
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
+  ret i32 %m
+}
+
+define i32 @length2_const(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length2_const(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP2]] to i32
+; X64-NEXT:    [[TMP4:%.*]] = sub i32 [[TMP3]], 12594
+; X64-NEXT:    ret i32 [[TMP4]]
+;
+; X64-SSE41-LABEL: define i32 @length2_const(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP2]] to i32
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = sub i32 [[TMP3]], 12594
+; X64-SSE41-NEXT:    ret i32 [[TMP4]]
+;
+; X64-AVX1-LABEL: define i32 @length2_const(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP2]] to i32
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = sub i32 [[TMP3]], 12594
+; X64-AVX1-NEXT:    ret i32 [[TMP4]]
+;
+; X64-AVX2-LABEL: define i32 @length2_const(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP2]] to i32
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = sub i32 [[TMP3]], 12594
+; X64-AVX2-NEXT:    ret i32 [[TMP4]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length2_const(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP2]] to i32
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = sub i32 [[TMP3]], 12594
+; X64-AVX512BW-256-NEXT:    ret i32 [[TMP4]]
+;
+; X64-AVX512BW-LABEL: define i32 @length2_const(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP2]] to i32
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = sub i32 [[TMP3]], 12594
+; X64-AVX512BW-NEXT:    ret i32 [[TMP4]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length2_const(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP2]] to i32
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = sub i32 [[TMP3]], 12594
+; X64-AVX512F-256-NEXT:    ret i32 [[TMP4]]
+;
+; X64-AVX512F-LABEL: define i32 @length2_const(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP2]] to i32
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = sub i32 [[TMP3]], 12594
+; X64-AVX512F-NEXT:    ret i32 [[TMP4]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length2_const(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP2]] to i32
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = sub i32 [[TMP3]], 12594
+; X64-MIC-AVX2-NEXT:    ret i32 [[TMP4]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length2_const(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP2]] to i32
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = sub i32 [[TMP3]], 12594
+; X64-MIC-AVX512F-NEXT:    ret i32 [[TMP4]]
+;
+
+
+
+
+
+
+; Two-byte memcmp of X against the bytes "12" at offset 1 of @.str (Y is unused); the assertions above expect that operand folded to 12594 (0x3132), so only X is loaded.
+  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i64 2) nounwind
+  ret i32 %m
+}
+
+define i1 @length2_gt_const(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length2_gt_const(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP2]] to i32
+; X64-NEXT:    [[TMP4:%.*]] = sub i32 [[TMP3]], 12594
+; X64-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP4]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length2_gt_const(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP2]] to i32
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = sub i32 [[TMP3]], 12594
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP4]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length2_gt_const(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP2]] to i32
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = sub i32 [[TMP3]], 12594
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP4]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length2_gt_const(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP2]] to i32
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = sub i32 [[TMP3]], 12594
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP4]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length2_gt_const(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP2]] to i32
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = sub i32 [[TMP3]], 12594
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP4]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length2_gt_const(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP2]] to i32
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = sub i32 [[TMP3]], 12594
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP4]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length2_gt_const(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP2]] to i32
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = sub i32 [[TMP3]], 12594
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP4]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length2_gt_const(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP2]] to i32
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = sub i32 [[TMP3]], 12594
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP4]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length2_gt_const(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP2]] to i32
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = sub i32 [[TMP3]], 12594
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP4]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length2_gt_const(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP2]] to i32
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = sub i32 [[TMP3]], 12594
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP4]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+
+
+
+
+; Same constant operand as length2_const ("12" at offset 1 of @.str), but the result feeds icmp sgt 0; the assertions above expect that compare to remain after the load/bswap/zext/sub expansion.
+  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i64 2) nounwind
+  %c = icmp sgt i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length2_eq(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length2_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length2_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length2_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length2_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length2_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length2_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length2_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length2_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length2_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length2_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+
+
+
+
+; Two-byte memcmp used only for equality with 0; the assertions above expect the two i16 loads compared directly with icmp ne (no bswap), zero-extended to i32, with the original icmp eq kept.
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length2_lt(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length2_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-NEXT:    [[C:%.*]] = icmp slt i32 [[TMP7]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length2_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp slt i32 [[TMP7]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length2_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp slt i32 [[TMP7]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length2_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp slt i32 [[TMP7]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length2_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp slt i32 [[TMP7]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length2_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp slt i32 [[TMP7]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length2_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp slt i32 [[TMP7]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length2_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp slt i32 [[TMP7]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length2_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp slt i32 [[TMP7]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length2_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp slt i32 [[TMP7]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+
+
+
+
+
+; Test input follows. It is a 2-byte memcmp whose result is only tested for being negative.
+; Every check prefix above expects the same expansion with no remaining memcmp call.
+; The expansion is two i16 loads, a bswap of each, zext to i32, a sub, and an slt test against 0.
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
+  %c = icmp slt i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length2_gt(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length2_gt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP7]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length2_gt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP7]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length2_gt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP7]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length2_gt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP7]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length2_gt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP7]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length2_gt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP7]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length2_gt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP7]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length2_gt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP7]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length2_gt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP7]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length2_gt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP7]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+
+
+
+
+
+; Test input follows. It is a 2-byte memcmp whose result is only tested for being positive.
+; Every check prefix above expects the same expansion with no remaining memcmp call.
+; The expansion is two i16 loads, a bswap of each, zext to i32, a sub, and an sgt test against 0.
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
+  %c = icmp sgt i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length2_eq_const(ptr %X) nounwind {
+; X64-LABEL: define i1 @length2_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = icmp ne i16 [[TMP1]], 12849
+; X64-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-NEXT:    ret i1 [[TMP2]]
+;
+; X64-SSE41-LABEL: define i1 @length2_eq_const(
+; X64-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = icmp ne i16 [[TMP1]], 12849
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-SSE41-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX1-LABEL: define i1 @length2_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = icmp ne i16 [[TMP1]], 12849
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX2-LABEL: define i1 @length2_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = icmp ne i16 [[TMP1]], 12849
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length2_eq_const(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = icmp ne i16 [[TMP1]], 12849
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX512BW-256-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX512BW-LABEL: define i1 @length2_eq_const(
+; X64-AVX512BW-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = icmp ne i16 [[TMP1]], 12849
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX512BW-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length2_eq_const(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = icmp ne i16 [[TMP1]], 12849
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX512F-256-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX512F-LABEL: define i1 @length2_eq_const(
+; X64-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = icmp ne i16 [[TMP1]], 12849
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX512F-NEXT:    ret i1 [[TMP2]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length2_eq_const(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = icmp ne i16 [[TMP1]], 12849
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-MIC-AVX2-NEXT:    ret i1 [[TMP2]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length2_eq_const(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = icmp ne i16 [[TMP1]], 12849
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP2]]
+;
+
+
+; Test input follows. It is a 2-byte memcmp of %X against @.str + 1, tested with ne 0 despite the _eq_ name.
+; Every check prefix above expects one i16 load compared ne against the immediate 12849.
+; The function returns that i1 directly, and the zext to i32 is emitted but not used by the return.
+; NOTE(review) 12849 is 0x3231, presumably bytes '1','2' read as a little-endian i16; @.str is defined outside this chunk, so confirm.
+  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i64 2) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length2_eq_nobuiltin_attr(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 2) #[[ATTR3:[0-9]+]]
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 2) #[[ATTR4:[0-9]+]]
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 2) #[[ATTR4:[0-9]+]]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 2) #[[ATTR4:[0-9]+]]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 2) #[[ATTR4:[0-9]+]]
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 2) #[[ATTR4:[0-9]+]]
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 2) #[[ATTR4:[0-9]+]]
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 2) #[[ATTR4:[0-9]+]]
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 2) #[[ATTR4:[0-9]+]]
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 2) #[[ATTR4:[0-9]+]]
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+; Test input follows. It is a 2-byte memcmp equality test whose call site carries the nobuiltin attribute.
+; Every check prefix above expects the memcmp call to remain in the output, with no inline load and compare expansion.
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind nobuiltin
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length3(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length3(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i16 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br label [[ENDBLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-SSE41-LABEL: define i32 @length3(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X64-SSE41:       res_block:
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-SSE41:       loadbb:
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP5]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X64-SSE41-NEXT:    [[TMP6]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i16 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-SSE41:       loadbb1:
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-SSE41-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-SSE41-NEXT:    br label [[ENDBLOCK]]
+; X64-SSE41:       endblock:
+; X64-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-SSE41-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX1-LABEL: define i32 @length3(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i16 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX2-LABEL: define i32 @length3(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i16 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length3(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW-256:       res_block:
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb:
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X64-AVX512BW-256-NEXT:    [[TMP6]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp eq i16 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb1:
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX512BW-256-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX512BW-256:       endblock:
+; X64-AVX512BW-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-256-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512BW-LABEL: define i32 @length3(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW:       res_block:
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW:       loadbb:
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X64-AVX512BW-NEXT:    [[TMP6]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = icmp eq i16 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW:       loadbb1:
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX512BW-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX512BW:       endblock:
+; X64-AVX512BW-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length3(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F-256:       res_block:
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F-256:       loadbb:
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X64-AVX512F-256-NEXT:    [[TMP6]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp eq i16 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F-256:       loadbb1:
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX512F-256-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX512F-256:       endblock:
+; X64-AVX512F-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-256-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512F-LABEL: define i32 @length3(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F:       res_block:
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F:       loadbb:
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP5]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X64-AVX512F-NEXT:    [[TMP6]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i16 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F:       loadbb1:
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX512F-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX512F:       endblock:
+; X64-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length3(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX2:       res_block:
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb:
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X64-MIC-AVX2-NEXT:    [[TMP6]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i16 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb1:
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-MIC-AVX2-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK]]
+; X64-MIC-AVX2:       endblock:
+; X64-MIC-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length3(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX512F:       res_block:
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb:
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X64-MIC-AVX512F-NEXT:    [[TMP6]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i16 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb1:
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-MIC-AVX512F-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK]]
+; X64-MIC-AVX512F:       endblock:
+; X64-MIC-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[PHI_RES]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 3) nounwind
+  ret i32 %m
+}
+
+define i1 @length3_eq(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length3_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; X64-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; X64-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; X64-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; X64-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; X64-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-NEXT:    ret i1 [[TMP12]]
+;
+; X64-SSE41-LABEL: define i1 @length3_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; X64-SSE41-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; X64-SSE41-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-SSE41-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX1-LABEL: define i1 @length3_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX2-LABEL: define i1 @length3_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length3_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; X64-AVX512BW-256-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512BW-256-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX512BW-LABEL: define i1 @length3_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; X64-AVX512BW-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; X64-AVX512BW-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512BW-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length3_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; X64-AVX512F-256-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; X64-AVX512F-256-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512F-256-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX512F-LABEL: define i1 @length3_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; X64-AVX512F-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; X64-AVX512F-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512F-NEXT:    ret i1 [[TMP12]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length3_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; X64-MIC-AVX2-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-MIC-AVX2-NEXT:    ret i1 [[TMP12]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length3_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; X64-MIC-AVX512F-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP12]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 3) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length4(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length4(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-NEXT:    ret i32 [[TMP9]]
+;
+; X64-SSE41-LABEL: define i32 @length4(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-SSE41-NEXT:    ret i32 [[TMP9]]
+;
+; X64-AVX1-LABEL: define i32 @length4(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-AVX1-NEXT:    ret i32 [[TMP9]]
+;
+; X64-AVX2-LABEL: define i32 @length4(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-AVX2-NEXT:    ret i32 [[TMP9]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length4(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-AVX512BW-256-NEXT:    ret i32 [[TMP9]]
+;
+; X64-AVX512BW-LABEL: define i32 @length4(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-AVX512BW-NEXT:    ret i32 [[TMP9]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length4(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-AVX512F-256-NEXT:    ret i32 [[TMP9]]
+;
+; X64-AVX512F-LABEL: define i32 @length4(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-AVX512F-NEXT:    ret i32 [[TMP9]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length4(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-MIC-AVX2-NEXT:    ret i32 [[TMP9]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length4(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[TMP9]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
+  ret i32 %m
+}
+
+define i1 @length4_eq(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length4_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-NEXT:    ret i1 [[TMP3]]
+;
+; X64-SSE41-LABEL: define i1 @length4_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-SSE41-NEXT:    ret i1 [[TMP3]]
+;
+; X64-AVX1-LABEL: define i1 @length4_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP3]]
+;
+; X64-AVX2-LABEL: define i1 @length4_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP3]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length4_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512BW-256-NEXT:    ret i1 [[TMP3]]
+;
+; X64-AVX512BW-LABEL: define i1 @length4_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512BW-NEXT:    ret i1 [[TMP3]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length4_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512F-256-NEXT:    ret i1 [[TMP3]]
+;
+; X64-AVX512F-LABEL: define i1 @length4_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512F-NEXT:    ret i1 [[TMP3]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length4_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-MIC-AVX2-NEXT:    ret i1 [[TMP3]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length4_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP3]]
+;
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length4_lt(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length4_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-NEXT:    ret i1 [[TMP5]]
+;
+; X64-SSE41-LABEL: define i1 @length4_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-SSE41-NEXT:    ret i1 [[TMP5]]
+;
+; X64-AVX1-LABEL: define i1 @length4_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-AVX1-NEXT:    ret i1 [[TMP5]]
+;
+; X64-AVX2-LABEL: define i1 @length4_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-AVX2-NEXT:    ret i1 [[TMP5]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length4_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-AVX512BW-256-NEXT:    ret i1 [[TMP5]]
+;
+; X64-AVX512BW-LABEL: define i1 @length4_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-AVX512BW-NEXT:    ret i1 [[TMP5]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length4_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-AVX512F-256-NEXT:    ret i1 [[TMP5]]
+;
+; X64-AVX512F-LABEL: define i1 @length4_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-AVX512F-NEXT:    ret i1 [[TMP5]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length4_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-MIC-AVX2-NEXT:    ret i1 [[TMP5]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length4_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP5]]
+;
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
+  %c = icmp slt i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length4_gt(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length4_gt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-NEXT:    ret i1 [[TMP5]]
+;
+; X64-SSE41-LABEL: define i1 @length4_gt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-SSE41-NEXT:    ret i1 [[TMP5]]
+;
+; X64-AVX1-LABEL: define i1 @length4_gt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-AVX1-NEXT:    ret i1 [[TMP5]]
+;
+; X64-AVX2-LABEL: define i1 @length4_gt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-AVX2-NEXT:    ret i1 [[TMP5]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length4_gt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-AVX512BW-256-NEXT:    ret i1 [[TMP5]]
+;
+; X64-AVX512BW-LABEL: define i1 @length4_gt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-AVX512BW-NEXT:    ret i1 [[TMP5]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length4_gt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-AVX512F-256-NEXT:    ret i1 [[TMP5]]
+;
+; X64-AVX512F-LABEL: define i1 @length4_gt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-AVX512F-NEXT:    ret i1 [[TMP5]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length4_gt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-MIC-AVX2-NEXT:    ret i1 [[TMP5]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length4_gt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP5]]
+;
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
+  %c = icmp sgt i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length4_eq_const(ptr %X) nounwind {
+; X64-LABEL: define i1 @length4_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 875770417
+; X64-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length4_eq_const(
+; X64-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 875770417
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length4_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 875770417
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length4_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 875770417
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length4_eq_const(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 875770417
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length4_eq_const(
+; X64-AVX512BW-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 875770417
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length4_eq_const(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 875770417
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length4_eq_const(
+; X64-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 875770417
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length4_eq_const(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 875770417
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length4_eq_const(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 875770417
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i64 4) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length5(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length5(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br label [[ENDBLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-SSE41-LABEL: define i32 @length5(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X64-SSE41:       res_block:
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-SSE41:       loadbb:
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-SSE41-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-SSE41:       loadbb1:
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-SSE41-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-SSE41-NEXT:    br label [[ENDBLOCK]]
+; X64-SSE41:       endblock:
+; X64-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-SSE41-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX1-LABEL: define i32 @length5(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX2-LABEL: define i32 @length5(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length5(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW-256:       res_block:
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb:
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX512BW-256-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb1:
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX512BW-256-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX512BW-256:       endblock:
+; X64-AVX512BW-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-256-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512BW-LABEL: define i32 @length5(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW:       res_block:
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW:       loadbb:
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX512BW-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW:       loadbb1:
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX512BW-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX512BW:       endblock:
+; X64-AVX512BW-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length5(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F-256:       res_block:
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F-256:       loadbb:
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX512F-256-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F-256:       loadbb1:
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX512F-256-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX512F-256:       endblock:
+; X64-AVX512F-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-256-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512F-LABEL: define i32 @length5(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F:       res_block:
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F:       loadbb:
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX512F-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F:       loadbb1:
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX512F-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX512F:       endblock:
+; X64-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length5(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX2:       res_block:
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb:
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-MIC-AVX2-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb1:
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-MIC-AVX2-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK]]
+; X64-MIC-AVX2:       endblock:
+; X64-MIC-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length5(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX512F:       res_block:
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb:
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-MIC-AVX512F-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb1:
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-MIC-AVX512F-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK]]
+; X64-MIC-AVX512F:       endblock:
+; X64-MIC-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[PHI_RES]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) nounwind
+  ret i32 %m
+}
+
+define i1 @length5_eq(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length5_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; X64-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; X64-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X64-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X64-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X64-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-NEXT:    ret i1 [[TMP12]]
+;
+; X64-SSE41-LABEL: define i1 @length5_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X64-SSE41-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X64-SSE41-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-SSE41-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX1-LABEL: define i1 @length5_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX2-LABEL: define i1 @length5_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length5_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X64-AVX512BW-256-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512BW-256-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX512BW-LABEL: define i1 @length5_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X64-AVX512BW-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X64-AVX512BW-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512BW-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length5_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X64-AVX512F-256-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X64-AVX512F-256-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512F-256-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX512F-LABEL: define i1 @length5_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X64-AVX512F-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X64-AVX512F-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512F-NEXT:    ret i1 [[TMP12]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length5_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X64-MIC-AVX2-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-MIC-AVX2-NEXT:    ret i1 [[TMP12]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length5_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X64-MIC-AVX512F-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP12]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length5_lt(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length5_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br label [[ENDBLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length5_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X64-SSE41:       res_block:
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-SSE41:       loadbb:
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-SSE41-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-SSE41:       loadbb1:
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-SSE41-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-SSE41-NEXT:    br label [[ENDBLOCK]]
+; X64-SSE41:       endblock:
+; X64-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length5_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length5_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length5_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW-256:       res_block:
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb:
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX512BW-256-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb1:
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX512BW-256-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX512BW-256:       endblock:
+; X64-AVX512BW-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length5_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW:       res_block:
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW:       loadbb:
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX512BW-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW:       loadbb1:
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX512BW-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX512BW:       endblock:
+; X64-AVX512BW-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length5_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F-256:       res_block:
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F-256:       loadbb:
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX512F-256-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F-256:       loadbb1:
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX512F-256-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX512F-256:       endblock:
+; X64-AVX512F-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length5_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F:       res_block:
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F:       loadbb:
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX512F-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F:       loadbb1:
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX512F-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX512F:       endblock:
+; X64-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length5_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX2:       res_block:
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb:
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-MIC-AVX2-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb1:
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-MIC-AVX2-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK]]
+; X64-MIC-AVX2:       endblock:
+; X64-MIC-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length5_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX512F:       res_block:
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb:
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-MIC-AVX512F-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb1:
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-MIC-AVX512F-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK]]
+; X64-MIC-AVX512F:       endblock:
+; X64-MIC-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+; Input IR for the expectations above - a memcmp of a constant 5 bytes whose
+; result is only tested for being negative. Every check prefix expects the
+; same inline expansion. The first four bytes of each buffer are loaded as
+; i32, passed through llvm.bswap.i32 and compared; on a mismatch res_block
+; selects -1 or 1 from an unsigned compare of the swapped words, otherwise the
+; bytes at offset 4 are zero-extended and subtracted. The phi is tested slt 0.
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) nounwind
+  %c = icmp slt i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length7(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length7(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-SSE41-LABEL: define i32 @length7(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X64-SSE41:       res_block:
+; X64-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-SSE41:       loadbb:
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-SSE41-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-SSE41:       loadbb1:
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-SSE41-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X64-SSE41-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-SSE41:       endblock:
+; X64-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-SSE41-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX1-LABEL: define i32 @length7(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX1-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-AVX1-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX2-LABEL: define i32 @length7(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-AVX2-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length7(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW-256:       res_block:
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb:
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX512BW-256-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb1:
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-AVX512BW-256-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       endblock:
+; X64-AVX512BW-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-256-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512BW-LABEL: define i32 @length7(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW:       res_block:
+; X64-AVX512BW-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512BW-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW:       loadbb:
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX512BW-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW:       loadbb1:
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-AVX512BW-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW:       endblock:
+; X64-AVX512BW-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length7(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F-256:       res_block:
+; X64-AVX512F-256-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512F-256-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F-256:       loadbb:
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX512F-256-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F-256:       loadbb1:
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-AVX512F-256-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       endblock:
+; X64-AVX512F-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-256-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512F-LABEL: define i32 @length7(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F:       res_block:
+; X64-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F:       loadbb:
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX512F-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F:       loadbb1:
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-AVX512F-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X64-AVX512F-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F:       endblock:
+; X64-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length7(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX2:       res_block:
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb:
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-MIC-AVX2-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb1:
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-MIC-AVX2-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       endblock:
+; X64-MIC-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length7(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX512F:       res_block:
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb:
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-MIC-AVX512F-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb1:
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-MIC-AVX512F-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       endblock:
+; X64-MIC-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[PHI_RES]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 7) nounwind
+  ret i32 %m
+}
+
+define i1 @length7_lt(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length7_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length7_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X64-SSE41:       res_block:
+; X64-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-SSE41:       loadbb:
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-SSE41-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-SSE41:       loadbb1:
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-SSE41-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X64-SSE41-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-SSE41:       endblock:
+; X64-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length7_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX1-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-AVX1-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length7_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-AVX2-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length7_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW-256:       res_block:
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb:
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX512BW-256-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb1:
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-AVX512BW-256-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       endblock:
+; X64-AVX512BW-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length7_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW:       res_block:
+; X64-AVX512BW-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512BW-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW:       loadbb:
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX512BW-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW:       loadbb1:
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-AVX512BW-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW:       endblock:
+; X64-AVX512BW-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length7_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F-256:       res_block:
+; X64-AVX512F-256-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512F-256-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F-256:       loadbb:
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX512F-256-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F-256:       loadbb1:
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-AVX512F-256-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       endblock:
+; X64-AVX512F-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length7_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F:       res_block:
+; X64-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F:       loadbb:
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX512F-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F:       loadbb1:
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-AVX512F-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X64-AVX512F-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F:       endblock:
+; X64-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length7_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX2:       res_block:
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb:
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-MIC-AVX2-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb1:
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-MIC-AVX2-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       endblock:
+; X64-MIC-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length7_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX512F:       res_block:
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb:
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-MIC-AVX512F-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb1:
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-MIC-AVX512F-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       endblock:
+; X64-MIC-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 7) nounwind
+  %c = icmp slt i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length7_eq(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length7_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X64-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X64-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X64-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-NEXT:    ret i1 [[TMP10]]
+;
+; X64-SSE41-LABEL: define i1 @length7_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-SSE41-NEXT:    ret i1 [[TMP10]]
+;
+; X64-AVX1-LABEL: define i1 @length7_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP10]]
+;
+; X64-AVX2-LABEL: define i1 @length7_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP10]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length7_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512BW-256-NEXT:    ret i1 [[TMP10]]
+;
+; X64-AVX512BW-LABEL: define i1 @length7_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512BW-NEXT:    ret i1 [[TMP10]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length7_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512F-256-NEXT:    ret i1 [[TMP10]]
+;
+; X64-AVX512F-LABEL: define i1 @length7_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512F-NEXT:    ret i1 [[TMP10]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length7_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-MIC-AVX2-NEXT:    ret i1 [[TMP10]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length7_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP10]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 7) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length8(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length8(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP1]])
+; X64-NEXT:    [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
+; X64-NEXT:    [[TMP5:%.*]] = icmp ugt i64 [[TMP3]], [[TMP4]]
+; X64-NEXT:    [[TMP6:%.*]] = icmp ult i64 [[TMP3]], [[TMP4]]
+; X64-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-NEXT:    ret i32 [[TMP9]]
+;
+; X64-SSE41-LABEL: define i32 @length8(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP1]])
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = icmp ugt i64 [[TMP3]], [[TMP4]]
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = icmp ult i64 [[TMP3]], [[TMP4]]
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-SSE41-NEXT:    ret i32 [[TMP9]]
+;
+; X64-AVX1-LABEL: define i32 @length8(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP1]])
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = icmp ugt i64 [[TMP3]], [[TMP4]]
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = icmp ult i64 [[TMP3]], [[TMP4]]
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-AVX1-NEXT:    ret i32 [[TMP9]]
+;
+; X64-AVX2-LABEL: define i32 @length8(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP1]])
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = icmp ugt i64 [[TMP3]], [[TMP4]]
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = icmp ult i64 [[TMP3]], [[TMP4]]
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-AVX2-NEXT:    ret i32 [[TMP9]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length8(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP1]])
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = icmp ugt i64 [[TMP3]], [[TMP4]]
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = icmp ult i64 [[TMP3]], [[TMP4]]
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-AVX512BW-256-NEXT:    ret i32 [[TMP9]]
+;
+; X64-AVX512BW-LABEL: define i32 @length8(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP1]])
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = icmp ugt i64 [[TMP3]], [[TMP4]]
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = icmp ult i64 [[TMP3]], [[TMP4]]
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-AVX512BW-NEXT:    ret i32 [[TMP9]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length8(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP1]])
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = icmp ugt i64 [[TMP3]], [[TMP4]]
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = icmp ult i64 [[TMP3]], [[TMP4]]
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-AVX512F-256-NEXT:    ret i32 [[TMP9]]
+;
+; X64-AVX512F-LABEL: define i32 @length8(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP1]])
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = icmp ugt i64 [[TMP3]], [[TMP4]]
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = icmp ult i64 [[TMP3]], [[TMP4]]
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-AVX512F-NEXT:    ret i32 [[TMP9]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length8(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP1]])
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = icmp ugt i64 [[TMP3]], [[TMP4]]
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = icmp ult i64 [[TMP3]], [[TMP4]]
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-MIC-AVX2-NEXT:    ret i32 [[TMP9]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length8(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP1]])
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = icmp ugt i64 [[TMP3]], [[TMP4]]
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = icmp ult i64 [[TMP3]], [[TMP4]]
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[TMP9]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 8) nounwind
+  ret i32 %m
+}
+
+define i1 @length8_eq(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length8_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length8_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length8_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length8_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length8_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length8_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length8_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length8_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length8_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length8_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 8) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length8_eq_const(ptr %X) nounwind {
+; X64-LABEL: define i1 @length8_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = icmp ne i64 [[TMP1]], 3978425819141910832
+; X64-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-NEXT:    ret i1 [[TMP2]]
+;
+; X64-SSE41-LABEL: define i1 @length8_eq_const(
+; X64-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = icmp ne i64 [[TMP1]], 3978425819141910832
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-SSE41-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX1-LABEL: define i1 @length8_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = icmp ne i64 [[TMP1]], 3978425819141910832
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX2-LABEL: define i1 @length8_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = icmp ne i64 [[TMP1]], 3978425819141910832
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length8_eq_const(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = icmp ne i64 [[TMP1]], 3978425819141910832
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX512BW-256-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX512BW-LABEL: define i1 @length8_eq_const(
+; X64-AVX512BW-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = icmp ne i64 [[TMP1]], 3978425819141910832
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX512BW-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length8_eq_const(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = icmp ne i64 [[TMP1]], 3978425819141910832
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX512F-256-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX512F-LABEL: define i1 @length8_eq_const(
+; X64-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = icmp ne i64 [[TMP1]], 3978425819141910832
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX512F-NEXT:    ret i1 [[TMP2]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length8_eq_const(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = icmp ne i64 [[TMP1]], 3978425819141910832
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-MIC-AVX2-NEXT:    ret i1 [[TMP2]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length8_eq_const(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = icmp ne i64 [[TMP1]], 3978425819141910832
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP2]]
+;
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 8) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length9_eq(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length9_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i64
+; X64-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i64
+; X64-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length9_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i64
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i64
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-SSE41-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-SSE41-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length9_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i64
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i64
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length9_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i64
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i64
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length9_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i64
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i64
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-AVX512BW-256-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length9_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i64
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i64
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-AVX512BW-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-AVX512BW-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length9_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i64
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i64
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-AVX512F-256-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-AVX512F-256-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length9_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i64
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i64
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-AVX512F-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-AVX512F-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length9_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i64
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i64
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-MIC-AVX2-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length9_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i64
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i64
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-MIC-AVX512F-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 9) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length10_eq(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length10_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i16, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP6]] to i64
+; X64-NEXT:    [[TMP9:%.*]] = zext i16 [[TMP7]] to i64
+; X64-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length10_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP4]], align 1
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i16, ptr [[TMP5]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP6]] to i64
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = zext i16 [[TMP7]] to i64
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-SSE41-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-SSE41-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length10_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i16, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP6]] to i64
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = zext i16 [[TMP7]] to i64
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length10_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i16, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP6]] to i64
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = zext i16 [[TMP7]] to i64
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length10_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP4]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i16, ptr [[TMP5]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP6]] to i64
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = zext i16 [[TMP7]] to i64
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-AVX512BW-256-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length10_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i16, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP6]] to i64
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = zext i16 [[TMP7]] to i64
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-AVX512BW-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-AVX512BW-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length10_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP4]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i16, ptr [[TMP5]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP6]] to i64
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = zext i16 [[TMP7]] to i64
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-AVX512F-256-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-AVX512F-256-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length10_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i16, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP6]] to i64
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = zext i16 [[TMP7]] to i64
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-AVX512F-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-AVX512F-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length10_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP4]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i16, ptr [[TMP5]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP6]] to i64
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = zext i16 [[TMP7]] to i64
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-MIC-AVX2-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length10_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i16, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP6]] to i64
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = zext i16 [[TMP7]] to i64
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-MIC-AVX512F-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+; The 10-byte memcmp result below is only tested for equality with 0. Every
+; check prefix above expects the call to be replaced by an i64 load at offset 0
+; plus an i16 load at offset 8 per pointer, xor/or-combined and compared with 0.
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 10) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length11_eq(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length11_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length11_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length11_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length11_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length11_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length11_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length11_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length11_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length11_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length11_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+; The 11-byte memcmp result below is only tested for equality with 0. Every
+; check prefix above expects two overlapping i64 loads per pointer (offsets 0
+; and 3, together covering bytes 0..10), xor/or-combined and compared with 0.
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 11) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length12_eq(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length12_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
+; X64-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP7]] to i64
+; X64-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-NEXT:    ret i1 [[TMP12]]
+;
+; X64-SSE41-LABEL: define i1 @length12_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP7]] to i64
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-SSE41-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-SSE41-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-SSE41-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX1-LABEL: define i1 @length12_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP7]] to i64
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX2-LABEL: define i1 @length12_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP7]] to i64
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length12_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP7]] to i64
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-AVX512BW-256-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512BW-256-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX512BW-LABEL: define i1 @length12_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP7]] to i64
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-AVX512BW-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-AVX512BW-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512BW-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length12_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP7]] to i64
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-AVX512F-256-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-AVX512F-256-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512F-256-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX512F-LABEL: define i1 @length12_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP7]] to i64
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-AVX512F-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-AVX512F-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512F-NEXT:    ret i1 [[TMP12]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length12_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP7]] to i64
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-MIC-AVX2-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-MIC-AVX2-NEXT:    ret i1 [[TMP12]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length12_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP7]] to i64
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-MIC-AVX512F-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP12]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+; Despite the _eq suffix, this test compares the 12-byte memcmp result with
+; "icmp ne". Every check prefix above expects an i64 load at offset 0 plus an
+; i32 load at offset 8 per pointer, xor/or-combined; the i1 produced by the
+; "icmp ne ..., 0" on the combined value is returned directly.
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 12) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length12(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length12(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP14:%.*]], [[LOADBB1:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP15:%.*]], [[LOADBB1]] ]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-NEXT:    [[TMP13:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-NEXT:    [[TMP14]] = zext i32 [[TMP12]] to i64
+; X64-NEXT:    [[TMP15]] = zext i32 [[TMP13]] to i64
+; X64-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[TMP14]], [[TMP15]]
+; X64-NEXT:    br i1 [[TMP16]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-SSE41-LABEL: define i32 @length12(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X64-SSE41:       res_block:
+; X64-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP14:%.*]], [[LOADBB1:%.*]] ]
+; X64-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP15:%.*]], [[LOADBB1]] ]
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-SSE41:       loadbb:
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-SSE41-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-SSE41:       loadbb1:
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-SSE41-NEXT:    [[TMP13:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-SSE41-NEXT:    [[TMP14]] = zext i32 [[TMP12]] to i64
+; X64-SSE41-NEXT:    [[TMP15]] = zext i32 [[TMP13]] to i64
+; X64-SSE41-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[TMP14]], [[TMP15]]
+; X64-SSE41-NEXT:    br i1 [[TMP16]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-SSE41:       endblock:
+; X64-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-SSE41-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX1-LABEL: define i32 @length12(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP14:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX1-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP15:%.*]], [[LOADBB1]] ]
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-AVX1-NEXT:    [[TMP14]] = zext i32 [[TMP12]] to i64
+; X64-AVX1-NEXT:    [[TMP15]] = zext i32 [[TMP13]] to i64
+; X64-AVX1-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[TMP14]], [[TMP15]]
+; X64-AVX1-NEXT:    br i1 [[TMP16]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX2-LABEL: define i32 @length12(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP14:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP15:%.*]], [[LOADBB1]] ]
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-AVX2-NEXT:    [[TMP14]] = zext i32 [[TMP12]] to i64
+; X64-AVX2-NEXT:    [[TMP15]] = zext i32 [[TMP13]] to i64
+; X64-AVX2-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[TMP14]], [[TMP15]]
+; X64-AVX2-NEXT:    br i1 [[TMP16]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length12(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW-256:       res_block:
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP14:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP15:%.*]], [[LOADBB1]] ]
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb:
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb1:
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-AVX512BW-256-NEXT:    [[TMP13:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-AVX512BW-256-NEXT:    [[TMP14]] = zext i32 [[TMP12]] to i64
+; X64-AVX512BW-256-NEXT:    [[TMP15]] = zext i32 [[TMP13]] to i64
+; X64-AVX512BW-256-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[TMP14]], [[TMP15]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP16]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       endblock:
+; X64-AVX512BW-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-256-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512BW-LABEL: define i32 @length12(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW:       res_block:
+; X64-AVX512BW-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP14:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512BW-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP15:%.*]], [[LOADBB1]] ]
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW:       loadbb:
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW:       loadbb1:
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-AVX512BW-NEXT:    [[TMP13:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-AVX512BW-NEXT:    [[TMP14]] = zext i32 [[TMP12]] to i64
+; X64-AVX512BW-NEXT:    [[TMP15]] = zext i32 [[TMP13]] to i64
+; X64-AVX512BW-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[TMP14]], [[TMP15]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP16]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW:       endblock:
+; X64-AVX512BW-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length12(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F-256:       res_block:
+; X64-AVX512F-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP14:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512F-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP15:%.*]], [[LOADBB1]] ]
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F-256:       loadbb:
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F-256:       loadbb1:
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-AVX512F-256-NEXT:    [[TMP13:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-AVX512F-256-NEXT:    [[TMP14]] = zext i32 [[TMP12]] to i64
+; X64-AVX512F-256-NEXT:    [[TMP15]] = zext i32 [[TMP13]] to i64
+; X64-AVX512F-256-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[TMP14]], [[TMP15]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP16]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       endblock:
+; X64-AVX512F-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-256-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512F-LABEL: define i32 @length12(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F:       res_block:
+; X64-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP14:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP15:%.*]], [[LOADBB1]] ]
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F:       loadbb:
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F:       loadbb1:
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-AVX512F-NEXT:    [[TMP13:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-AVX512F-NEXT:    [[TMP14]] = zext i32 [[TMP12]] to i64
+; X64-AVX512F-NEXT:    [[TMP15]] = zext i32 [[TMP13]] to i64
+; X64-AVX512F-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[TMP14]], [[TMP15]]
+; X64-AVX512F-NEXT:    br i1 [[TMP16]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F:       endblock:
+; X64-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length12(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX2:       res_block:
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP14:%.*]], [[LOADBB1:%.*]] ]
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP15:%.*]], [[LOADBB1]] ]
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb:
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb1:
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-MIC-AVX2-NEXT:    [[TMP13:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-MIC-AVX2-NEXT:    [[TMP14]] = zext i32 [[TMP12]] to i64
+; X64-MIC-AVX2-NEXT:    [[TMP15]] = zext i32 [[TMP13]] to i64
+; X64-MIC-AVX2-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[TMP14]], [[TMP15]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP16]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       endblock:
+; X64-MIC-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length12(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX512F:       res_block:
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP14:%.*]], [[LOADBB1:%.*]] ]
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP15:%.*]], [[LOADBB1]] ]
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb:
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb1:
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-MIC-AVX512F-NEXT:    [[TMP13:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-MIC-AVX512F-NEXT:    [[TMP14]] = zext i32 [[TMP12]] to i64
+; X64-MIC-AVX512F-NEXT:    [[TMP15]] = zext i32 [[TMP13]] to i64
+; X64-MIC-AVX512F-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[TMP14]], [[TMP15]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP16]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       endblock:
+; X64-MIC-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[PHI_RES]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 12) nounwind
+  ret i32 %m
+}
+
+define i1 @length13_eq(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length13_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 5
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 5
+; X64-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length13_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 5
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 5
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length13_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 5
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 5
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length13_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 5
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 5
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length13_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 5
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 5
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length13_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 5
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 5
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length13_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 5
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 5
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length13_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 5
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 5
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length13_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 5
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 5
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length13_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 5
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 5
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 13) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length14_eq(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length14_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 6
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 6
+; X64-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length14_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 6
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 6
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length14_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 6
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 6
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length14_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 6
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 6
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length14_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 6
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 6
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length14_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 6
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 6
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length14_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 6
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 6
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length14_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 6
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 6
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length14_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 6
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 6
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length14_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 6
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 6
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 14) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length15(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length15(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-SSE41-LABEL: define i32 @length15(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X64-SSE41:       res_block:
+; X64-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-SSE41:       loadbb:
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-SSE41-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-SSE41:       loadbb1:
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-SSE41-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-SSE41-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-SSE41:       endblock:
+; X64-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-SSE41-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX1-LABEL: define i32 @length15(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX1-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX1-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX2-LABEL: define i32 @length15(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length15(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW-256:       res_block:
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb:
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb1:
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       endblock:
+; X64-AVX512BW-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-256-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512BW-LABEL: define i32 @length15(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW:       res_block:
+; X64-AVX512BW-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512BW-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW:       loadbb:
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW:       loadbb1:
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW:       endblock:
+; X64-AVX512BW-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length15(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F-256:       res_block:
+; X64-AVX512F-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512F-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F-256:       loadbb:
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F-256:       loadbb1:
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       endblock:
+; X64-AVX512F-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-256-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512F-LABEL: define i32 @length15(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F:       res_block:
+; X64-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F:       loadbb:
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F:       loadbb1:
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F:       endblock:
+; X64-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length15(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX2:       res_block:
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb:
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb1:
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       endblock:
+; X64-MIC-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length15(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX512F:       res_block:
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb:
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb1:
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       endblock:
+; X64-MIC-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[PHI_RES]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 15) nounwind
+  ret i32 %m
+}
+
+define i1 @length15_lt(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length15_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length15_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X64-SSE41:       res_block:
+; X64-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-SSE41:       loadbb:
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-SSE41-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-SSE41:       loadbb1:
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-SSE41-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-SSE41-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-SSE41:       endblock:
+; X64-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length15_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX1-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX1-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length15_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length15_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW-256:       res_block:
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb:
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb1:
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       endblock:
+; X64-AVX512BW-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length15_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW:       res_block:
+; X64-AVX512BW-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512BW-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW:       loadbb:
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW:       loadbb1:
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW:       endblock:
+; X64-AVX512BW-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length15_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F-256:       res_block:
+; X64-AVX512F-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512F-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F-256:       loadbb:
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F-256:       loadbb1:
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       endblock:
+; X64-AVX512F-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length15_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F:       res_block:
+; X64-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F:       loadbb:
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F:       loadbb1:
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F:       endblock:
+; X64-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length15_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX2:       res_block:
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb:
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb1:
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       endblock:
+; X64-MIC-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length15_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX512F:       res_block:
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb:
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb1:
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       endblock:
+; X64-MIC-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 15) nounwind
+  %c = icmp slt i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length15_const(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length15_const(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4:%.*]], [[LOADBB]] ], [ [[TMP8:%.*]], [[LOADBB1:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ 3544952156018063160, [[LOADBB]] ], [ 4051322327650219061, [[LOADBB1]] ]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[TMP4]], 3544952156018063160
+; X64-NEXT:    br i1 [[TMP5]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP6]], align 1
+; X64-NEXT:    [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP7]])
+; X64-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 4051322327650219061
+; X64-NEXT:    br i1 [[TMP9]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-SSE41-LABEL: define i32 @length15_const(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X64-SSE41:       res_block:
+; X64-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4:%.*]], [[LOADBB]] ], [ [[TMP8:%.*]], [[LOADBB1:%.*]] ]
+; X64-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ 3544952156018063160, [[LOADBB]] ], [ 4051322327650219061, [[LOADBB1]] ]
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-SSE41:       loadbb:
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP4]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[TMP4]], 3544952156018063160
+; X64-SSE41-NEXT:    br i1 [[TMP5]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-SSE41:       loadbb1:
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP6]], align 1
+; X64-SSE41-NEXT:    [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP7]])
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 4051322327650219061
+; X64-SSE41-NEXT:    br i1 [[TMP9]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-SSE41:       endblock:
+; X64-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-SSE41-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX1-LABEL: define i32 @length15_const(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4:%.*]], [[LOADBB]] ], [ [[TMP8:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX1-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ 3544952156018063160, [[LOADBB]] ], [ 4051322327650219061, [[LOADBB1]] ]
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[TMP4]], 3544952156018063160
+; X64-AVX1-NEXT:    br i1 [[TMP5]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP6]], align 1
+; X64-AVX1-NEXT:    [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP7]])
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 4051322327650219061
+; X64-AVX1-NEXT:    br i1 [[TMP9]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX2-LABEL: define i32 @length15_const(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4:%.*]], [[LOADBB]] ], [ [[TMP8:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ 3544952156018063160, [[LOADBB]] ], [ 4051322327650219061, [[LOADBB1]] ]
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[TMP4]], 3544952156018063160
+; X64-AVX2-NEXT:    br i1 [[TMP5]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP6]], align 1
+; X64-AVX2-NEXT:    [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP7]])
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 4051322327650219061
+; X64-AVX2-NEXT:    br i1 [[TMP9]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length15_const(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW-256:       res_block:
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4:%.*]], [[LOADBB]] ], [ [[TMP8:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ 3544952156018063160, [[LOADBB]] ], [ 4051322327650219061, [[LOADBB1]] ]
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb:
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP4]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[TMP4]], 3544952156018063160
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP5]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb1:
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP6]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP7]])
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 4051322327650219061
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP9]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       endblock:
+; X64-AVX512BW-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-256-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512BW-LABEL: define i32 @length15_const(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW:       res_block:
+; X64-AVX512BW-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4:%.*]], [[LOADBB]] ], [ [[TMP8:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512BW-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ 3544952156018063160, [[LOADBB]] ], [ 4051322327650219061, [[LOADBB1]] ]
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW:       loadbb:
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP4]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[TMP4]], 3544952156018063160
+; X64-AVX512BW-NEXT:    br i1 [[TMP5]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW:       loadbb1:
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP6]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP7]])
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 4051322327650219061
+; X64-AVX512BW-NEXT:    br i1 [[TMP9]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW:       endblock:
+; X64-AVX512BW-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length15_const(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F-256:       res_block:
+; X64-AVX512F-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4:%.*]], [[LOADBB]] ], [ [[TMP8:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512F-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ 3544952156018063160, [[LOADBB]] ], [ 4051322327650219061, [[LOADBB1]] ]
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F-256:       loadbb:
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP4]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[TMP4]], 3544952156018063160
+; X64-AVX512F-256-NEXT:    br i1 [[TMP5]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F-256:       loadbb1:
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP6]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP7]])
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 4051322327650219061
+; X64-AVX512F-256-NEXT:    br i1 [[TMP9]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       endblock:
+; X64-AVX512F-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-256-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512F-LABEL: define i32 @length15_const(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F:       res_block:
+; X64-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4:%.*]], [[LOADBB]] ], [ [[TMP8:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ 3544952156018063160, [[LOADBB]] ], [ 4051322327650219061, [[LOADBB1]] ]
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F:       loadbb:
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP4]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[TMP4]], 3544952156018063160
+; X64-AVX512F-NEXT:    br i1 [[TMP5]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F:       loadbb1:
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP6]], align 1
+; X64-AVX512F-NEXT:    [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP7]])
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 4051322327650219061
+; X64-AVX512F-NEXT:    br i1 [[TMP9]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F:       endblock:
+; X64-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length15_const(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX2:       res_block:
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4:%.*]], [[LOADBB]] ], [ [[TMP8:%.*]], [[LOADBB1:%.*]] ]
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ 3544952156018063160, [[LOADBB]] ], [ 4051322327650219061, [[LOADBB1]] ]
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb:
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP4]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[TMP4]], 3544952156018063160
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP5]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb1:
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP6]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP7]])
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 4051322327650219061
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP9]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       endblock:
+; X64-MIC-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length15_const(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX512F:       res_block:
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4:%.*]], [[LOADBB]] ], [ [[TMP8:%.*]], [[LOADBB1:%.*]] ]
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ 3544952156018063160, [[LOADBB]] ], [ 4051322327650219061, [[LOADBB1]] ]
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb:
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP4]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[TMP4]], 3544952156018063160
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP5]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb1:
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP6]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP7]])
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 4051322327650219061
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP9]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       endblock:
+; X64-MIC-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[PHI_RES]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i64 15) nounwind
+  ret i32 %m
+}
+
+define i1 @length15_eq(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length15_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length15_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length15_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length15_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length15_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length15_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length15_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length15_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length15_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length15_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 15) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length15_gt_const(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length15_gt_const(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4:%.*]], [[LOADBB]] ], [ [[TMP8:%.*]], [[LOADBB1:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ 3544952156018063160, [[LOADBB]] ], [ 4051322327650219061, [[LOADBB1]] ]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[TMP4]], 3544952156018063160
+; X64-NEXT:    br i1 [[TMP5]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP6]], align 1
+; X64-NEXT:    [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP7]])
+; X64-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 4051322327650219061
+; X64-NEXT:    br i1 [[TMP9]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    [[C:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length15_gt_const(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X64-SSE41:       res_block:
+; X64-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4:%.*]], [[LOADBB]] ], [ [[TMP8:%.*]], [[LOADBB1:%.*]] ]
+; X64-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ 3544952156018063160, [[LOADBB]] ], [ 4051322327650219061, [[LOADBB1]] ]
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-SSE41:       loadbb:
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP4]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[TMP4]], 3544952156018063160
+; X64-SSE41-NEXT:    br i1 [[TMP5]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-SSE41:       loadbb1:
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP6]], align 1
+; X64-SSE41-NEXT:    [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP7]])
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 4051322327650219061
+; X64-SSE41-NEXT:    br i1 [[TMP9]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-SSE41:       endblock:
+; X64-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length15_gt_const(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4:%.*]], [[LOADBB]] ], [ [[TMP8:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX1-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ 3544952156018063160, [[LOADBB]] ], [ 4051322327650219061, [[LOADBB1]] ]
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[TMP4]], 3544952156018063160
+; X64-AVX1-NEXT:    br i1 [[TMP5]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP6]], align 1
+; X64-AVX1-NEXT:    [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP7]])
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 4051322327650219061
+; X64-AVX1-NEXT:    br i1 [[TMP9]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length15_gt_const(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4:%.*]], [[LOADBB]] ], [ [[TMP8:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ 3544952156018063160, [[LOADBB]] ], [ 4051322327650219061, [[LOADBB1]] ]
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[TMP4]], 3544952156018063160
+; X64-AVX2-NEXT:    br i1 [[TMP5]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP6]], align 1
+; X64-AVX2-NEXT:    [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP7]])
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 4051322327650219061
+; X64-AVX2-NEXT:    br i1 [[TMP9]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length15_gt_const(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW-256:       res_block:
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4:%.*]], [[LOADBB]] ], [ [[TMP8:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ 3544952156018063160, [[LOADBB]] ], [ 4051322327650219061, [[LOADBB1]] ]
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb:
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP4]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[TMP4]], 3544952156018063160
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP5]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb1:
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP6]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP7]])
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 4051322327650219061
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP9]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       endblock:
+; X64-AVX512BW-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length15_gt_const(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW:       res_block:
+; X64-AVX512BW-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4:%.*]], [[LOADBB]] ], [ [[TMP8:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512BW-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ 3544952156018063160, [[LOADBB]] ], [ 4051322327650219061, [[LOADBB1]] ]
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW:       loadbb:
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP4]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[TMP4]], 3544952156018063160
+; X64-AVX512BW-NEXT:    br i1 [[TMP5]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW:       loadbb1:
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP6]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP7]])
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 4051322327650219061
+; X64-AVX512BW-NEXT:    br i1 [[TMP9]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW:       endblock:
+; X64-AVX512BW-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length15_gt_const(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F-256:       res_block:
+; X64-AVX512F-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4:%.*]], [[LOADBB]] ], [ [[TMP8:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512F-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ 3544952156018063160, [[LOADBB]] ], [ 4051322327650219061, [[LOADBB1]] ]
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F-256:       loadbb:
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP4]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[TMP4]], 3544952156018063160
+; X64-AVX512F-256-NEXT:    br i1 [[TMP5]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F-256:       loadbb1:
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP6]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP7]])
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 4051322327650219061
+; X64-AVX512F-256-NEXT:    br i1 [[TMP9]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       endblock:
+; X64-AVX512F-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length15_gt_const(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F:       res_block:
+; X64-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4:%.*]], [[LOADBB]] ], [ [[TMP8:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ 3544952156018063160, [[LOADBB]] ], [ 4051322327650219061, [[LOADBB1]] ]
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F:       loadbb:
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP4]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[TMP4]], 3544952156018063160
+; X64-AVX512F-NEXT:    br i1 [[TMP5]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F:       loadbb1:
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP6]], align 1
+; X64-AVX512F-NEXT:    [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP7]])
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 4051322327650219061
+; X64-AVX512F-NEXT:    br i1 [[TMP9]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F:       endblock:
+; X64-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length15_gt_const(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX2:       res_block:
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4:%.*]], [[LOADBB]] ], [ [[TMP8:%.*]], [[LOADBB1:%.*]] ]
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ 3544952156018063160, [[LOADBB]] ], [ 4051322327650219061, [[LOADBB1]] ]
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb:
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP4]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[TMP4]], 3544952156018063160
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP5]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb1:
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP6]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP7]])
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 4051322327650219061
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP9]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       endblock:
+; X64-MIC-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length15_gt_const(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX512F:       res_block:
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP4:%.*]], [[LOADBB]] ], [ [[TMP8:%.*]], [[LOADBB1:%.*]] ]
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ 3544952156018063160, [[LOADBB]] ], [ 4051322327650219061, [[LOADBB1]] ]
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb:
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP4]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[TMP4]], 3544952156018063160
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP5]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb1:
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP6]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP7]])
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP8]], 4051322327650219061
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP9]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       endblock:
+; X64-MIC-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i64 15) nounwind
+  %c = icmp sgt i32 %m, 0
+  ret i1 %c
+}
+
+; PR33329 - https://bugs.llvm.org/show_bug.cgi?id=33329
+
+define i32 @length16(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length16(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-SSE41-LABEL: define i32 @length16(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X64-SSE41:       res_block:
+; X64-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-SSE41:       loadbb:
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-SSE41-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-SSE41:       loadbb1:
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-SSE41-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-SSE41-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-SSE41:       endblock:
+; X64-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-SSE41-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX1-LABEL: define i32 @length16(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX1-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX1-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX2-LABEL: define i32 @length16(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length16(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW-256:       res_block:
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb:
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb1:
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       endblock:
+; X64-AVX512BW-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-256-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512BW-LABEL: define i32 @length16(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW:       res_block:
+; X64-AVX512BW-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512BW-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW:       loadbb:
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW:       loadbb1:
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW:       endblock:
+; X64-AVX512BW-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length16(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F-256:       res_block:
+; X64-AVX512F-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512F-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F-256:       loadbb:
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F-256:       loadbb1:
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       endblock:
+; X64-AVX512F-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-256-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512F-LABEL: define i32 @length16(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F:       res_block:
+; X64-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F:       loadbb:
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F:       loadbb1:
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F:       endblock:
+; X64-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length16(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX2:       res_block:
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb:
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb1:
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       endblock:
+; X64-MIC-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length16(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX512F:       res_block:
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb:
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb1:
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       endblock:
+; X64-MIC-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[PHI_RES]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 16) nounwind
+  ret i32 %m
+}
+
+define i1 @length16_eq(ptr %x, ptr %y) nounwind {
+; memcmp(x, y, 16) used only as a '!= 0' test; each IR check group below expects two align-1 i128 loads, one icmp ne feeding the ret, and a zext the ret does not use.
+; X64-LABEL: define i1 @length16_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = icmp ne i128 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-NEXT:    ret i1 [[TMP3]]
+;
+; X64-SSE41-LABEL: define i1 @length16_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = icmp ne i128 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-SSE41-NEXT:    ret i1 [[TMP3]]
+;
+; X64-AVX1-LABEL: define i1 @length16_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = icmp ne i128 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP3]]
+;
+; X64-AVX2-LABEL: define i1 @length16_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = icmp ne i128 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP3]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length16_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = icmp ne i128 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512BW-256-NEXT:    ret i1 [[TMP3]]
+;
+; X64-AVX512BW-LABEL: define i1 @length16_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = icmp ne i128 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512BW-NEXT:    ret i1 [[TMP3]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length16_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = icmp ne i128 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512F-256-NEXT:    ret i1 [[TMP3]]
+;
+; X64-AVX512F-LABEL: define i1 @length16_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = icmp ne i128 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512F-NEXT:    ret i1 [[TMP3]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length16_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = icmp ne i128 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-MIC-AVX2-NEXT:    ret i1 [[TMP3]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length16_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = icmp ne i128 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP3]]
+; NOTE(review) - the two check groups below (prefixes X64-AVX, X64-MIC-AVX) hold x86 assembly rather than IR; presumably stale leftovers from an llc-based test, confirm against the RUN lines.
+; X64-AVX-LABEL: length16_eq:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-AVX-NEXT:    vpxor (%rsi), %xmm0, %xmm0
+; X64-AVX-NEXT:    vptest %xmm0, %xmm0
+; X64-AVX-NEXT:    setne %al
+; X64-AVX-NEXT:    retq
+; X64-MIC-AVX-LABEL: length16_eq:
+; X64-MIC-AVX:       # %bb.0:
+; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-MIC-AVX-NEXT:    vmovdqu (%rsi), %xmm1
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0
+; X64-MIC-AVX-NEXT:    kortestw %k0, %k0
+; X64-MIC-AVX-NEXT:    setne %al
+; X64-MIC-AVX-NEXT:    vzeroupper
+; X64-MIC-AVX-NEXT:    retq
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 16) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length16_lt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length16_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length16_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X64-SSE41:       res_block:
+; X64-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-SSE41:       loadbb:
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-SSE41-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-SSE41:       loadbb1:
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-SSE41-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-SSE41-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-SSE41:       endblock:
+; X64-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length16_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX1-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX1-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length16_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length16_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW-256:       res_block:
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb:
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb1:
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       endblock:
+; X64-AVX512BW-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length16_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW:       res_block:
+; X64-AVX512BW-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512BW-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW:       loadbb:
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW:       loadbb1:
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW:       endblock:
+; X64-AVX512BW-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length16_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F-256:       res_block:
+; X64-AVX512F-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512F-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F-256:       loadbb:
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F-256:       loadbb1:
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       endblock:
+; X64-AVX512F-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length16_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F:       res_block:
+; X64-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F:       loadbb:
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F:       loadbb1:
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F:       endblock:
+; X64-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length16_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX2:       res_block:
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb:
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb1:
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       endblock:
+; X64-MIC-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length16_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX512F:       res_block:
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb:
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb1:
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       endblock:
+; X64-MIC-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 16) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+; Checks the expansion of `memcmp(x, y, 16) > 0`. For every check prefix
+; below, the expected IR is two 8-byte load + bswap + compare steps (loadbb,
+; loadbb1) that branch to a shared res_block selecting -1 or 1, a phi of 0 on
+; the all-equal path, and a final `icmp sgt` of that phi against 0.
+define i1 @length16_gt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length16_gt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length16_gt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X64-SSE41:       res_block:
+; X64-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-SSE41:       loadbb:
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-SSE41-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-SSE41:       loadbb1:
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-SSE41-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-SSE41-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-SSE41:       endblock:
+; X64-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length16_gt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX1-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX1-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length16_gt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length16_gt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW-256:       res_block:
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb:
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb1:
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       endblock:
+; X64-AVX512BW-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length16_gt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW:       res_block:
+; X64-AVX512BW-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512BW-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW:       loadbb:
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW:       loadbb1:
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW:       endblock:
+; X64-AVX512BW-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length16_gt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F-256:       res_block:
+; X64-AVX512F-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512F-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F-256:       loadbb:
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F-256:       loadbb1:
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       endblock:
+; X64-AVX512F-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length16_gt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F:       res_block:
+; X64-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F:       loadbb:
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F:       loadbb1:
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F:       endblock:
+; X64-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length16_gt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX2:       res_block:
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb:
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb1:
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       endblock:
+; X64-MIC-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length16_gt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX512F:       res_block:
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb:
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb1:
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       endblock:
+; X64-MIC-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 16) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+; Checks the expansion of `memcmp(X, @.str, 16) == 0`. For every check prefix
+; below, the expected IR is a single unaligned i128 load of X compared with an
+; i128 immediate (presumably the leading 16 bytes of @.str, whose definition is
+; outside this function), zero-extended to i32 and then tested against 0.
+;
+; The assembly-level checks that used to follow the IR checks were left over
+; from the llc-based version of this test: they matched an asm label and x86
+; instructions, which can never appear in the IR these checks are written
+; against, so they have been dropped.
+define i1 @length16_eq_const(ptr %X) nounwind {
+;
+; X64-LABEL: define i1 @length16_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = icmp ne i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length16_eq_const(
+; X64-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = icmp ne i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length16_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = icmp ne i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length16_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = icmp ne i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length16_eq_const(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = icmp ne i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length16_eq_const(
+; X64-AVX512BW-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = icmp ne i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length16_eq_const(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = icmp ne i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length16_eq_const(
+; X64-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = icmp ne i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length16_eq_const(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = icmp ne i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length16_eq_const(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = icmp ne i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 16) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; PR33914 - https://bugs.llvm.org/show_bug.cgi?id=33914
+
+define i32 @length24(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length24(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64:       loadbb2:
+; X64-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-SSE41-LABEL: define i32 @length24(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X64-SSE41:       res_block:
+; X64-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-SSE41:       loadbb:
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-SSE41-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-SSE41:       loadbb1:
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-SSE41-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-SSE41-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-SSE41:       loadbb2:
+; X64-SSE41-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-SSE41-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-SSE41-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-SSE41-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-SSE41-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-SSE41-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-SSE41-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-SSE41-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-SSE41:       endblock:
+; X64-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-SSE41-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX1-LABEL: define i32 @length24(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-AVX1-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX1-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX1:       loadbb2:
+; X64-AVX1-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX1-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX1-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX1-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX1-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX1-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX1-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX1-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX2-LABEL: define i32 @length24(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX2:       loadbb2:
+; X64-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX2-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX2-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX2-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX2-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX2-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX2-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX2-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length24(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW-256:       res_block:
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb:
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb1:
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       loadbb2:
+; X64-AVX512BW-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512BW-256-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512BW-256-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       endblock:
+; X64-AVX512BW-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-256-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512BW-LABEL: define i32 @length24(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW:       res_block:
+; X64-AVX512BW-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-AVX512BW-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW:       loadbb:
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW:       loadbb1:
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512BW:       loadbb2:
+; X64-AVX512BW-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512BW-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512BW-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512BW-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512BW-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW:       endblock:
+; X64-AVX512BW-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length24(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F-256:       res_block:
+; X64-AVX512F-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-AVX512F-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F-256:       loadbb:
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F-256:       loadbb1:
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       loadbb2:
+; X64-AVX512F-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512F-256-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512F-256-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       endblock:
+; X64-AVX512F-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-256-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512F-LABEL: define i32 @length24(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F:       res_block:
+; X64-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F:       loadbb:
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F:       loadbb1:
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512F:       loadbb2:
+; X64-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512F-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512F-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512F-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512F-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512F-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F:       endblock:
+; X64-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length24(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX2:       res_block:
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb:
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb1:
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       loadbb2:
+; X64-MIC-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-MIC-AVX2-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-MIC-AVX2-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       endblock:
+; X64-MIC-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length24(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX512F:       res_block:
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb:
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb1:
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       loadbb2:
+; X64-MIC-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-MIC-AVX512F-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-MIC-AVX512F-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       endblock:
+; X64-MIC-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[PHI_RES]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 24) nounwind
+  ret i32 %m
+}
+
+define i1 @length24_eq(ptr %x, ptr %y) nounwind {
+;
+; X64-LABEL: define i1 @length24_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = zext i64 [[TMP6]] to i128
+; X64-NEXT:    [[TMP9:%.*]] = zext i64 [[TMP7]] to i128
+; X64-NEXT:    [[TMP10:%.*]] = xor i128 [[TMP8]], [[TMP9]]
+; X64-NEXT:    [[TMP11:%.*]] = or i128 [[TMP3]], [[TMP10]]
+; X64-NEXT:    [[TMP12:%.*]] = icmp ne i128 [[TMP11]], 0
+; X64-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length24_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = zext i64 [[TMP6]] to i128
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = zext i64 [[TMP7]] to i128
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = xor i128 [[TMP8]], [[TMP9]]
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = or i128 [[TMP3]], [[TMP10]]
+; X64-SSE41-NEXT:    [[TMP12:%.*]] = icmp ne i128 [[TMP11]], 0
+; X64-SSE41-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length24_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = zext i64 [[TMP6]] to i128
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = zext i64 [[TMP7]] to i128
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = xor i128 [[TMP8]], [[TMP9]]
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = or i128 [[TMP3]], [[TMP10]]
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = icmp ne i128 [[TMP11]], 0
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length24_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = zext i64 [[TMP6]] to i128
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = zext i64 [[TMP7]] to i128
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = xor i128 [[TMP8]], [[TMP9]]
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = or i128 [[TMP3]], [[TMP10]]
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = icmp ne i128 [[TMP11]], 0
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length24_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = zext i64 [[TMP6]] to i128
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = zext i64 [[TMP7]] to i128
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = xor i128 [[TMP8]], [[TMP9]]
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = or i128 [[TMP3]], [[TMP10]]
+; X64-AVX512BW-256-NEXT:    [[TMP12:%.*]] = icmp ne i128 [[TMP11]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length24_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = zext i64 [[TMP6]] to i128
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = zext i64 [[TMP7]] to i128
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = xor i128 [[TMP8]], [[TMP9]]
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = or i128 [[TMP3]], [[TMP10]]
+; X64-AVX512BW-NEXT:    [[TMP12:%.*]] = icmp ne i128 [[TMP11]], 0
+; X64-AVX512BW-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length24_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = zext i64 [[TMP6]] to i128
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = zext i64 [[TMP7]] to i128
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = xor i128 [[TMP8]], [[TMP9]]
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = or i128 [[TMP3]], [[TMP10]]
+; X64-AVX512F-256-NEXT:    [[TMP12:%.*]] = icmp ne i128 [[TMP11]], 0
+; X64-AVX512F-256-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length24_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = zext i64 [[TMP6]] to i128
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = zext i64 [[TMP7]] to i128
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = xor i128 [[TMP8]], [[TMP9]]
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = or i128 [[TMP3]], [[TMP10]]
+; X64-AVX512F-NEXT:    [[TMP12:%.*]] = icmp ne i128 [[TMP11]], 0
+; X64-AVX512F-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length24_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = zext i64 [[TMP6]] to i128
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = zext i64 [[TMP7]] to i128
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = xor i128 [[TMP8]], [[TMP9]]
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = or i128 [[TMP3]], [[TMP10]]
+; X64-MIC-AVX2-NEXT:    [[TMP12:%.*]] = icmp ne i128 [[TMP11]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length24_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = zext i64 [[TMP6]] to i128
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = zext i64 [[TMP7]] to i128
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = xor i128 [[TMP8]], [[TMP9]]
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = or i128 [[TMP3]], [[TMP10]]
+; X64-MIC-AVX512F-NEXT:    [[TMP12:%.*]] = icmp ne i128 [[TMP11]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX-LABEL: length24_eq:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; X64-AVX-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
+; X64-AVX-NEXT:    vpxor %xmm2, %xmm1, %xmm1
+; X64-AVX-NEXT:    vpxor (%rsi), %xmm0, %xmm0
+; X64-AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; X64-AVX-NEXT:    vptest %xmm0, %xmm0
+; X64-AVX-NEXT:    sete %al
+; X64-AVX-NEXT:    retq
+; X64-MIC-AVX-LABEL: length24_eq:
+; X64-MIC-AVX:       # %bb.0:
+; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-MIC-AVX-NEXT:    vmovdqu (%rsi), %xmm1
+; X64-MIC-AVX-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
+; X64-MIC-AVX-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm3, %zmm2, %k0
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm1, %zmm0, %k1
+; X64-MIC-AVX-NEXT:    kortestw %k0, %k1
+; X64-MIC-AVX-NEXT:    sete %al
+; X64-MIC-AVX-NEXT:    vzeroupper
+; X64-MIC-AVX-NEXT:    retq
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 24) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length24_lt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length24_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64:       loadbb2:
+; X64-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length24_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X64-SSE41:       res_block:
+; X64-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-SSE41:       loadbb:
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-SSE41-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-SSE41:       loadbb1:
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-SSE41-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-SSE41-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-SSE41:       loadbb2:
+; X64-SSE41-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-SSE41-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-SSE41-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-SSE41-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-SSE41-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-SSE41-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-SSE41-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-SSE41-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-SSE41:       endblock:
+; X64-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length24_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-AVX1-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX1-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX1:       loadbb2:
+; X64-AVX1-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX1-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX1-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX1-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX1-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX1-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX1-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX1-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length24_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX2:       loadbb2:
+; X64-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX2-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX2-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX2-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX2-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX2-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX2-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX2-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length24_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW-256:       res_block:
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb:
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb1:
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       loadbb2:
+; X64-AVX512BW-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512BW-256-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512BW-256-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       endblock:
+; X64-AVX512BW-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length24_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW:       res_block:
+; X64-AVX512BW-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-AVX512BW-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW:       loadbb:
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW:       loadbb1:
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512BW:       loadbb2:
+; X64-AVX512BW-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512BW-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512BW-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512BW-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512BW-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW:       endblock:
+; X64-AVX512BW-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length24_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F-256:       res_block:
+; X64-AVX512F-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-AVX512F-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F-256:       loadbb:
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F-256:       loadbb1:
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       loadbb2:
+; X64-AVX512F-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512F-256-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512F-256-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       endblock:
+; X64-AVX512F-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length24_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F:       res_block:
+; X64-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F:       loadbb:
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F:       loadbb1:
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512F:       loadbb2:
+; X64-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512F-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512F-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512F-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512F-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512F-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F:       endblock:
+; X64-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length24_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX2:       res_block:
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb:
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb1:
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       loadbb2:
+; X64-MIC-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-MIC-AVX2-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-MIC-AVX2-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       endblock:
+; X64-MIC-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length24_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX512F:       res_block:
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb:
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb1:
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       loadbb2:
+; X64-MIC-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-MIC-AVX512F-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-MIC-AVX512F-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       endblock:
+; X64-MIC-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 24) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length24_gt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length24_gt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64:       loadbb2:
+; X64-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length24_gt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X64-SSE41:       res_block:
+; X64-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-SSE41:       loadbb:
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-SSE41-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-SSE41:       loadbb1:
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-SSE41-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-SSE41-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-SSE41:       loadbb2:
+; X64-SSE41-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-SSE41-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-SSE41-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-SSE41-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-SSE41-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-SSE41-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-SSE41-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-SSE41-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-SSE41:       endblock:
+; X64-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length24_gt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-AVX1-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX1-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX1:       loadbb2:
+; X64-AVX1-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX1-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX1-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX1-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX1-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX1-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX1-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX1-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length24_gt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX2:       loadbb2:
+; X64-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX2-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX2-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX2-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX2-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX2-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX2-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX2-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length24_gt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW-256:       res_block:
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb:
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb1:
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       loadbb2:
+; X64-AVX512BW-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512BW-256-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512BW-256-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       endblock:
+; X64-AVX512BW-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length24_gt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW:       res_block:
+; X64-AVX512BW-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-AVX512BW-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW:       loadbb:
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW:       loadbb1:
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512BW:       loadbb2:
+; X64-AVX512BW-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512BW-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512BW-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512BW-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512BW-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW:       endblock:
+; X64-AVX512BW-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length24_gt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F-256:       res_block:
+; X64-AVX512F-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-AVX512F-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F-256:       loadbb:
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F-256:       loadbb1:
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       loadbb2:
+; X64-AVX512F-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512F-256-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512F-256-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       endblock:
+; X64-AVX512F-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length24_gt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F:       res_block:
+; X64-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F:       loadbb:
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F:       loadbb1:
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512F:       loadbb2:
+; X64-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512F-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512F-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512F-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512F-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512F-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F:       endblock:
+; X64-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length24_gt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX2:       res_block:
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb:
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb1:
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       loadbb2:
+; X64-MIC-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-MIC-AVX2-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-MIC-AVX2-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       endblock:
+; X64-MIC-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length24_gt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX512F:       res_block:
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb:
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb1:
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       loadbb2:
+; X64-MIC-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-MIC-AVX512F-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-MIC-AVX512F-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       endblock:
+; X64-MIC-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 24) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length24_eq_const(ptr %X) nounwind {
+;
+; X64-LABEL: define i1 @length24_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 1
+; X64-NEXT:    [[TMP5:%.*]] = zext i64 [[TMP4]] to i128
+; X64-NEXT:    [[TMP6:%.*]] = xor i128 [[TMP5]], 3689065127958034230
+; X64-NEXT:    [[TMP7:%.*]] = or i128 [[TMP2]], [[TMP6]]
+; X64-NEXT:    [[TMP8:%.*]] = icmp ne i128 [[TMP7]], 0
+; X64-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; X64-NEXT:    ret i1 [[TMP8]]
+;
+; X64-SSE41-LABEL: define i1 @length24_eq_const(
+; X64-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 1
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = zext i64 [[TMP4]] to i128
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = xor i128 [[TMP5]], 3689065127958034230
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = or i128 [[TMP2]], [[TMP6]]
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = icmp ne i128 [[TMP7]], 0
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; X64-SSE41-NEXT:    ret i1 [[TMP8]]
+;
+; X64-AVX1-LABEL: define i1 @length24_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 1
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = zext i64 [[TMP4]] to i128
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = xor i128 [[TMP5]], 3689065127958034230
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = or i128 [[TMP2]], [[TMP6]]
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = icmp ne i128 [[TMP7]], 0
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP8]]
+;
+; X64-AVX2-LABEL: define i1 @length24_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 1
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = zext i64 [[TMP4]] to i128
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = xor i128 [[TMP5]], 3689065127958034230
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = or i128 [[TMP2]], [[TMP6]]
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = icmp ne i128 [[TMP7]], 0
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP8]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length24_eq_const(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = zext i64 [[TMP4]] to i128
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = xor i128 [[TMP5]], 3689065127958034230
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = or i128 [[TMP2]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = icmp ne i128 [[TMP7]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; X64-AVX512BW-256-NEXT:    ret i1 [[TMP8]]
+;
+; X64-AVX512BW-LABEL: define i1 @length24_eq_const(
+; X64-AVX512BW-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = zext i64 [[TMP4]] to i128
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = xor i128 [[TMP5]], 3689065127958034230
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = or i128 [[TMP2]], [[TMP6]]
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = icmp ne i128 [[TMP7]], 0
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; X64-AVX512BW-NEXT:    ret i1 [[TMP8]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length24_eq_const(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = zext i64 [[TMP4]] to i128
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = xor i128 [[TMP5]], 3689065127958034230
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = or i128 [[TMP2]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = icmp ne i128 [[TMP7]], 0
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; X64-AVX512F-256-NEXT:    ret i1 [[TMP8]]
+;
+; X64-AVX512F-LABEL: define i1 @length24_eq_const(
+; X64-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 1
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = zext i64 [[TMP4]] to i128
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = xor i128 [[TMP5]], 3689065127958034230
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = or i128 [[TMP2]], [[TMP6]]
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = icmp ne i128 [[TMP7]], 0
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; X64-AVX512F-NEXT:    ret i1 [[TMP8]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length24_eq_const(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = zext i64 [[TMP4]] to i128
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = xor i128 [[TMP5]], 3689065127958034230
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = or i128 [[TMP2]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = icmp ne i128 [[TMP7]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; X64-MIC-AVX2-NEXT:    ret i1 [[TMP8]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length24_eq_const(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = zext i64 [[TMP4]] to i128
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = xor i128 [[TMP5]], 3689065127958034230
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = or i128 [[TMP2]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = icmp ne i128 [[TMP7]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP8]]
+;
+; X64-AVX-LABEL: length24_eq_const:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; X64-AVX-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; X64-AVX-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vptest %xmm0, %xmm0
+; X64-AVX-NEXT:    setne %al
+; X64-AVX-NEXT:    retq
+; X64-MIC-AVX-LABEL: length24_eq_const:
+; X64-MIC-AVX:       # %bb.0:
+; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-MIC-AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; X64-MIC-AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [959985462,858927408,0,0]
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm2, %zmm1, %k0
+; X64-MIC-AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [858927408,926299444,825243960,892613426]
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm1, %zmm0, %k1
+; X64-MIC-AVX-NEXT:    kortestw %k0, %k1
+; X64-MIC-AVX-NEXT:    setne %al
+; X64-MIC-AVX-NEXT:    vzeroupper
+; X64-MIC-AVX-NEXT:    retq
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 24) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length31(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length31(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64:       loadbb2:
+; X64-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64:       loadbb3:
+; X64-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-SSE41-LABEL: define i32 @length31(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X64-SSE41:       res_block:
+; X64-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-SSE41:       loadbb:
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-SSE41-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-SSE41:       loadbb1:
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-SSE41-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-SSE41-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-SSE41:       loadbb2:
+; X64-SSE41-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-SSE41-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-SSE41-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-SSE41-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-SSE41-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-SSE41-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-SSE41-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-SSE41-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-SSE41:       loadbb3:
+; X64-SSE41-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-SSE41-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-SSE41-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-SSE41-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-SSE41-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-SSE41-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-SSE41-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-SSE41-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-SSE41:       endblock:
+; X64-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-SSE41-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX1-LABEL: define i32 @length31(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX1-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX1-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX1:       loadbb2:
+; X64-AVX1-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX1-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX1-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX1-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX1-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX1-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX1-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX1-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX1:       loadbb3:
+; X64-AVX1-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-AVX1-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-AVX1-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX1-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX1-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX1-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX1-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX1-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX2-LABEL: define i32 @length31(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX2:       loadbb2:
+; X64-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX2-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX2-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX2-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX2-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX2-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX2-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX2-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX2:       loadbb3:
+; X64-AVX2-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-AVX2-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-AVX2-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX2-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX2-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX2-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX2-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX2-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length31(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW-256:       res_block:
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb:
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb1:
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       loadbb2:
+; X64-AVX512BW-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512BW-256-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512BW-256-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       loadbb3:
+; X64-AVX512BW-256-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-AVX512BW-256-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-AVX512BW-256-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512BW-256-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512BW-256-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       endblock:
+; X64-AVX512BW-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-256-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512BW-LABEL: define i32 @length31(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW:       res_block:
+; X64-AVX512BW-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512BW-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW:       loadbb:
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW:       loadbb1:
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512BW:       loadbb2:
+; X64-AVX512BW-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512BW-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512BW-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512BW-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512BW-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512BW:       loadbb3:
+; X64-AVX512BW-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-AVX512BW-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-AVX512BW-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512BW-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512BW-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512BW-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512BW-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW:       endblock:
+; X64-AVX512BW-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length31(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F-256:       res_block:
+; X64-AVX512F-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512F-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F-256:       loadbb:
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F-256:       loadbb1:
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       loadbb2:
+; X64-AVX512F-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512F-256-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512F-256-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       loadbb3:
+; X64-AVX512F-256-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-AVX512F-256-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-AVX512F-256-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512F-256-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512F-256-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       endblock:
+; X64-AVX512F-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-256-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512F-LABEL: define i32 @length31(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F:       res_block:
+; X64-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F:       loadbb:
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F:       loadbb1:
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512F:       loadbb2:
+; X64-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512F-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512F-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512F-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512F-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512F-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512F:       loadbb3:
+; X64-AVX512F-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-AVX512F-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-AVX512F-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512F-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512F-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512F-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512F-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512F-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F:       endblock:
+; X64-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length31(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX2:       res_block:
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb:
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb1:
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       loadbb2:
+; X64-MIC-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-MIC-AVX2-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-MIC-AVX2-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       loadbb3:
+; X64-MIC-AVX2-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-MIC-AVX2-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-MIC-AVX2-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-MIC-AVX2-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-MIC-AVX2-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       endblock:
+; X64-MIC-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length31(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX512F:       res_block:
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb:
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb1:
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       loadbb2:
+; X64-MIC-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-MIC-AVX512F-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-MIC-AVX512F-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       loadbb3:
+; X64-MIC-AVX512F-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-MIC-AVX512F-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-MIC-AVX512F-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-MIC-AVX512F-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-MIC-AVX512F-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       endblock:
+; X64-MIC-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[PHI_RES]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 31) nounwind
+  ret i32 %m
+}
+
+define i1 @length31_eq(ptr %x, ptr %y) nounwind {
+; memcmp(x, y, 31) whose result is only tested against zero; the IR checks below expect two overlapping i128 loads per pointer (offsets 0 and 15), xor'ed pairwise, or'ed together and compared with 0.
+; X64-LABEL: define i1 @length31_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X64-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length31_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length31_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length31_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length31_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length31_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length31_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length31_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length31_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length31_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+; NOTE(review): the two check groups below match x86 assembly rather than IR and look like leftovers from the former llc-based test; confirm that a RUN line still enables their prefixes, otherwise they are dead checks.
+; X64-AVX-LABEL: length31_eq:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-AVX-NEXT:    vmovdqu 15(%rdi), %xmm1
+; X64-AVX-NEXT:    vpxor 15(%rsi), %xmm1, %xmm1
+; X64-AVX-NEXT:    vpxor (%rsi), %xmm0, %xmm0
+; X64-AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vptest %xmm0, %xmm0
+; X64-AVX-NEXT:    sete %al
+; X64-AVX-NEXT:    retq
+; X64-MIC-AVX-LABEL: length31_eq:
+; X64-MIC-AVX:       # %bb.0:
+; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-MIC-AVX-NEXT:    vmovdqu 15(%rdi), %xmm1
+; X64-MIC-AVX-NEXT:    vmovdqu (%rsi), %xmm2
+; X64-MIC-AVX-NEXT:    vmovdqu 15(%rsi), %xmm3
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm3, %zmm1, %k0
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm2, %zmm0, %k1
+; X64-MIC-AVX-NEXT:    kortestw %k0, %k1
+; X64-MIC-AVX-NEXT:    sete %al
+; X64-MIC-AVX-NEXT:    vzeroupper
+; X64-MIC-AVX-NEXT:    retq
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 31) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length31_lt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length31_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64:       loadbb2:
+; X64-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64:       loadbb3:
+; X64-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length31_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X64-SSE41:       res_block:
+; X64-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-SSE41:       loadbb:
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-SSE41-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-SSE41:       loadbb1:
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-SSE41-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-SSE41-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-SSE41:       loadbb2:
+; X64-SSE41-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-SSE41-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-SSE41-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-SSE41-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-SSE41-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-SSE41-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-SSE41-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-SSE41-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-SSE41:       loadbb3:
+; X64-SSE41-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-SSE41-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-SSE41-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-SSE41-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-SSE41-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-SSE41-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-SSE41-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-SSE41-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-SSE41:       endblock:
+; X64-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length31_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX1-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX1-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX1:       loadbb2:
+; X64-AVX1-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX1-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX1-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX1-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX1-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX1-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX1-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX1-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX1:       loadbb3:
+; X64-AVX1-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-AVX1-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-AVX1-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX1-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX1-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX1-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX1-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX1-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length31_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX2:       loadbb2:
+; X64-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX2-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX2-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX2-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX2-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX2-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX2-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX2-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX2:       loadbb3:
+; X64-AVX2-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-AVX2-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-AVX2-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX2-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX2-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX2-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX2-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX2-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length31_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW-256:       res_block:
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb:
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb1:
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       loadbb2:
+; X64-AVX512BW-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512BW-256-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512BW-256-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       loadbb3:
+; X64-AVX512BW-256-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-AVX512BW-256-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-AVX512BW-256-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512BW-256-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512BW-256-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       endblock:
+; X64-AVX512BW-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length31_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW:       res_block:
+; X64-AVX512BW-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512BW-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW:       loadbb:
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW:       loadbb1:
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512BW:       loadbb2:
+; X64-AVX512BW-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512BW-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512BW-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512BW-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512BW-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512BW:       loadbb3:
+; X64-AVX512BW-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-AVX512BW-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-AVX512BW-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512BW-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512BW-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512BW-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512BW-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW:       endblock:
+; X64-AVX512BW-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length31_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F-256:       res_block:
+; X64-AVX512F-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512F-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F-256:       loadbb:
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F-256:       loadbb1:
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       loadbb2:
+; X64-AVX512F-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512F-256-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512F-256-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       loadbb3:
+; X64-AVX512F-256-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-AVX512F-256-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-AVX512F-256-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512F-256-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512F-256-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       endblock:
+; X64-AVX512F-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length31_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F:       res_block:
+; X64-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F:       loadbb:
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F:       loadbb1:
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512F:       loadbb2:
+; X64-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512F-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512F-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512F-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512F-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512F-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512F:       loadbb3:
+; X64-AVX512F-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-AVX512F-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-AVX512F-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512F-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512F-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512F-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512F-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512F-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F:       endblock:
+; X64-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length31_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX2:       res_block:
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb:
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb1:
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       loadbb2:
+; X64-MIC-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-MIC-AVX2-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-MIC-AVX2-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       loadbb3:
+; X64-MIC-AVX2-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-MIC-AVX2-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-MIC-AVX2-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-MIC-AVX2-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-MIC-AVX2-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       endblock:
+; X64-MIC-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length31_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX512F:       res_block:
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb:
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb1:
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       loadbb2:
+; X64-MIC-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-MIC-AVX512F-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-MIC-AVX512F-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       loadbb3:
+; X64-MIC-AVX512F-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-MIC-AVX512F-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-MIC-AVX512F-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-MIC-AVX512F-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-MIC-AVX512F-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       endblock:
+; X64-MIC-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 31) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length31_gt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length31_gt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64:       loadbb2:
+; X64-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64:       loadbb3:
+; X64-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length31_gt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X64-SSE41:       res_block:
+; X64-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-SSE41:       loadbb:
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-SSE41-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-SSE41:       loadbb1:
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-SSE41-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-SSE41-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-SSE41:       loadbb2:
+; X64-SSE41-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-SSE41-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-SSE41-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-SSE41-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-SSE41-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-SSE41-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-SSE41-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-SSE41-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-SSE41:       loadbb3:
+; X64-SSE41-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-SSE41-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-SSE41-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-SSE41-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-SSE41-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-SSE41-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-SSE41-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-SSE41-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-SSE41:       endblock:
+; X64-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length31_gt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX1-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX1-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX1:       loadbb2:
+; X64-AVX1-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX1-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX1-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX1-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX1-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX1-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX1-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX1-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX1:       loadbb3:
+; X64-AVX1-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-AVX1-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-AVX1-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX1-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX1-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX1-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX1-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX1-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length31_gt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX2:       loadbb2:
+; X64-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX2-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX2-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX2-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX2-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX2-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX2-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX2-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX2:       loadbb3:
+; X64-AVX2-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-AVX2-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-AVX2-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX2-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX2-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX2-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX2-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX2-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length31_gt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW-256:       res_block:
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb:
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb1:
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       loadbb2:
+; X64-AVX512BW-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512BW-256-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512BW-256-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       loadbb3:
+; X64-AVX512BW-256-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-AVX512BW-256-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-AVX512BW-256-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512BW-256-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512BW-256-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       endblock:
+; X64-AVX512BW-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length31_gt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW:       res_block:
+; X64-AVX512BW-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512BW-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW:       loadbb:
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW:       loadbb1:
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512BW:       loadbb2:
+; X64-AVX512BW-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512BW-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512BW-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512BW-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512BW-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512BW:       loadbb3:
+; X64-AVX512BW-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-AVX512BW-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-AVX512BW-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512BW-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512BW-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512BW-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512BW-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW:       endblock:
+; X64-AVX512BW-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length31_gt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F-256:       res_block:
+; X64-AVX512F-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512F-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F-256:       loadbb:
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F-256:       loadbb1:
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       loadbb2:
+; X64-AVX512F-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512F-256-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512F-256-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       loadbb3:
+; X64-AVX512F-256-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-AVX512F-256-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-AVX512F-256-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512F-256-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512F-256-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       endblock:
+; X64-AVX512F-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length31_gt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F:       res_block:
+; X64-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F:       loadbb:
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F:       loadbb1:
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512F:       loadbb2:
+; X64-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512F-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512F-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512F-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512F-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512F-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512F:       loadbb3:
+; X64-AVX512F-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-AVX512F-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-AVX512F-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512F-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512F-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512F-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512F-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512F-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F:       endblock:
+; X64-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length31_gt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX2:       res_block:
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb:
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb1:
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       loadbb2:
+; X64-MIC-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-MIC-AVX2-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-MIC-AVX2-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       loadbb3:
+; X64-MIC-AVX2-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-MIC-AVX2-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-MIC-AVX2-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-MIC-AVX2-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-MIC-AVX2-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       endblock:
+; X64-MIC-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length31_gt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX512F:       res_block:
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb:
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb1:
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       loadbb2:
+; X64-MIC-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-MIC-AVX512F-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-MIC-AVX512F-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       loadbb3:
+; X64-MIC-AVX512F-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-MIC-AVX512F-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-MIC-AVX512F-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-MIC-AVX512F-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-MIC-AVX512F-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       endblock:
+; X64-MIC-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 31) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length31_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"="128" {
+; Equality-only memcmp of 31 bytes under "prefer-vector-width"="128"; the IR check blocks below expect two overlapping i128 loads per pointer (offsets 0 and 15), xor'd and or'd into a single icmp against zero.
+; X64-LABEL: define i1 @length31_eq_prefer128(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1:[0-9]+]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X64-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length31_eq_prefer128(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2:[0-9]+]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length31_eq_prefer128(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2:[0-9]+]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length31_eq_prefer128(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2:[0-9]+]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length31_eq_prefer128(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2:[0-9]+]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length31_eq_prefer128(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2:[0-9]+]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length31_eq_prefer128(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2:[0-9]+]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length31_eq_prefer128(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2:[0-9]+]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length31_eq_prefer128(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2:[0-9]+]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length31_eq_prefer128(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2:[0-9]+]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+; NOTE(review): the assembly checks below (X64-AVX / X64-MIC-AVX prefixes) look like leftovers from an llc-based version of this test, while the blocks above match IR -- confirm whether any RUN line still uses these prefixes.
+; X64-AVX-LABEL: length31_eq_prefer128:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-AVX-NEXT:    vmovdqu 15(%rdi), %xmm1
+; X64-AVX-NEXT:    vpxor 15(%rsi), %xmm1, %xmm1
+; X64-AVX-NEXT:    vpxor (%rsi), %xmm0, %xmm0
+; X64-AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vptest %xmm0, %xmm0
+; X64-AVX-NEXT:    sete %al
+; X64-AVX-NEXT:    retq
+; X64-MIC-AVX-LABEL: length31_eq_prefer128:
+; X64-MIC-AVX:       # %bb.0:
+; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-MIC-AVX-NEXT:    vmovdqu 15(%rdi), %xmm1
+; X64-MIC-AVX-NEXT:    vmovdqu (%rsi), %xmm2
+; X64-MIC-AVX-NEXT:    vmovdqu 15(%rsi), %xmm3
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm3, %zmm1, %k0
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm2, %zmm0, %k1
+; X64-MIC-AVX-NEXT:    kortestw %k0, %k1
+; X64-MIC-AVX-NEXT:    sete %al
+; X64-MIC-AVX-NEXT:    vzeroupper
+; X64-MIC-AVX-NEXT:    retq
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 31) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length31_eq_const(ptr %X) nounwind {
+;
+; X64-LABEL: define i1 @length31_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 64100044907875699958541276911416849973
+; X64-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X64-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; X64-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-NEXT:    ret i1 [[TMP7]]
+;
+; X64-SSE41-LABEL: define i1 @length31_eq_const(
+; X64-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 64100044907875699958541276911416849973
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-SSE41-NEXT:    ret i1 [[TMP7]]
+;
+; X64-AVX1-LABEL: define i1 @length31_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 64100044907875699958541276911416849973
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP7]]
+;
+; X64-AVX2-LABEL: define i1 @length31_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 64100044907875699958541276911416849973
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP7]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length31_eq_const(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 64100044907875699958541276911416849973
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-AVX512BW-256-NEXT:    ret i1 [[TMP7]]
+;
+; X64-AVX512BW-LABEL: define i1 @length31_eq_const(
+; X64-AVX512BW-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 64100044907875699958541276911416849973
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-AVX512BW-NEXT:    ret i1 [[TMP7]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length31_eq_const(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 64100044907875699958541276911416849973
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-AVX512F-256-NEXT:    ret i1 [[TMP7]]
+;
+; X64-AVX512F-LABEL: define i1 @length31_eq_const(
+; X64-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 64100044907875699958541276911416849973
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-AVX512F-NEXT:    ret i1 [[TMP7]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length31_eq_const(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 64100044907875699958541276911416849973
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-MIC-AVX2-NEXT:    ret i1 [[TMP7]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length31_eq_const(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 64100044907875699958541276911416849973
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP7]]
+;
+; X64-AVX-LABEL: length31_eq_const:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-AVX-NEXT:    vmovdqu 15(%rdi), %xmm1
+; X64-AVX-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; X64-AVX-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vptest %xmm0, %xmm0
+; X64-AVX-NEXT:    setne %al
+; X64-AVX-NEXT:    retq
+; X64-MIC-AVX-LABEL: length31_eq_const:
+; X64-MIC-AVX:       # %bb.0:
+; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-MIC-AVX-NEXT:    vmovdqu 15(%rdi), %xmm1
+; X64-MIC-AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [943142453,842084409,909456435,809056311]
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm2, %zmm1, %k0
+; X64-MIC-AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [858927408,926299444,825243960,892613426]
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm1, %zmm0, %k1
+; X64-MIC-AVX-NEXT:    kortestw %k0, %k1
+; X64-MIC-AVX-NEXT:    setne %al
+; X64-MIC-AVX-NEXT:    vzeroupper
+; X64-MIC-AVX-NEXT:    retq
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 31) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length32(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length32(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64:       loadbb2:
+; X64-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64:       loadbb3:
+; X64-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-SSE41-LABEL: define i32 @length32(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X64-SSE41:       res_block:
+; X64-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-SSE41:       loadbb:
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-SSE41-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-SSE41:       loadbb1:
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-SSE41-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-SSE41-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-SSE41:       loadbb2:
+; X64-SSE41-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-SSE41-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-SSE41-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-SSE41-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-SSE41-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-SSE41-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-SSE41-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-SSE41-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-SSE41:       loadbb3:
+; X64-SSE41-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-SSE41-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-SSE41-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-SSE41-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-SSE41-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-SSE41-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-SSE41-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-SSE41-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-SSE41:       endblock:
+; X64-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-SSE41-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX1-LABEL: define i32 @length32(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX1-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX1-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX1:       loadbb2:
+; X64-AVX1-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX1-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX1-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX1-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX1-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX1-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX1-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX1-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX1:       loadbb3:
+; X64-AVX1-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-AVX1-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-AVX1-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX1-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX1-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX1-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX1-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX1-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX2-LABEL: define i32 @length32(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX2:       loadbb2:
+; X64-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX2-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX2-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX2-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX2-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX2-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX2-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX2-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX2:       loadbb3:
+; X64-AVX2-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-AVX2-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-AVX2-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX2-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX2-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX2-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX2-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX2-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length32(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW-256:       res_block:
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb:
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb1:
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       loadbb2:
+; X64-AVX512BW-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512BW-256-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512BW-256-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       loadbb3:
+; X64-AVX512BW-256-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-AVX512BW-256-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-AVX512BW-256-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512BW-256-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512BW-256-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       endblock:
+; X64-AVX512BW-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-256-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512BW-LABEL: define i32 @length32(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW:       res_block:
+; X64-AVX512BW-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512BW-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW:       loadbb:
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW:       loadbb1:
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512BW:       loadbb2:
+; X64-AVX512BW-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512BW-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512BW-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512BW-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512BW-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512BW:       loadbb3:
+; X64-AVX512BW-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-AVX512BW-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-AVX512BW-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512BW-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512BW-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512BW-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512BW-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW:       endblock:
+; X64-AVX512BW-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length32(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F-256:       res_block:
+; X64-AVX512F-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512F-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F-256:       loadbb:
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F-256:       loadbb1:
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       loadbb2:
+; X64-AVX512F-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512F-256-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512F-256-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       loadbb3:
+; X64-AVX512F-256-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-AVX512F-256-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-AVX512F-256-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512F-256-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512F-256-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       endblock:
+; X64-AVX512F-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-256-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512F-LABEL: define i32 @length32(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F:       res_block:
+; X64-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F:       loadbb:
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F:       loadbb1:
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512F:       loadbb2:
+; X64-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512F-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512F-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512F-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512F-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512F-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512F:       loadbb3:
+; X64-AVX512F-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-AVX512F-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-AVX512F-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512F-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512F-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512F-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512F-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512F-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F:       endblock:
+; X64-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length32(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX2:       res_block:
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb:
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb1:
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       loadbb2:
+; X64-MIC-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-MIC-AVX2-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-MIC-AVX2-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       loadbb3:
+; X64-MIC-AVX2-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-MIC-AVX2-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-MIC-AVX2-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-MIC-AVX2-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-MIC-AVX2-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       endblock:
+; X64-MIC-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length32(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX512F:       res_block:
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb:
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb1:
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       loadbb2:
+; X64-MIC-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-MIC-AVX512F-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-MIC-AVX512F-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       loadbb3:
+; X64-MIC-AVX512F-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-MIC-AVX512F-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-MIC-AVX512F-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-MIC-AVX512F-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-MIC-AVX512F-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       endblock:
+; X64-MIC-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[PHI_RES]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 32) nounwind
+  ret i32 %m
+}
+
+; PR33325 - https://bugs.llvm.org/show_bug.cgi?id=33325
+
+define i1 @length32_eq(ptr %x, ptr %y) nounwind {
+; Equality-only memcmp of 32 bytes: returns true when memcmp(%x, %y, 32) is 0.
+; The checks below expect the call to be replaced by wide integer loads and a
+; single compare: two i128 pairs combined with xor/or under the X64 and
+; X64-SSE41 prefixes, one i256 pair under the remaining IR prefixes.
+;
+; X64-LABEL: define i1 @length32_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length32_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length32_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = icmp ne i256 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length32_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = icmp ne i256 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length32_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = icmp ne i256 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length32_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = icmp ne i256 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length32_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = icmp ne i256 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length32_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = icmp ne i256 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length32_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = icmp ne i256 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length32_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = icmp ne i256 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; NOTE(review): the two blocks below (X64-AVX512 and X64-MIC-AVX prefixes) match
+; x86 assembly rather than IR, unlike every other block in this function; they
+; look like leftovers from the llc-based version of this test. Confirm whether
+; any RUN line still enables these prefixes.
+; X64-AVX512-LABEL: length32_eq:
+; X64-AVX512:       # %bb.0:
+; X64-AVX512-NEXT:    vmovdqu (%rdi), %ymm0
+; X64-AVX512-NEXT:    vpxor (%rsi), %ymm0, %ymm0
+; X64-AVX512-NEXT:    vptest %ymm0, %ymm0
+; X64-AVX512-NEXT:    sete %al
+; X64-AVX512-NEXT:    vzeroupper
+; X64-AVX512-NEXT:    retq
+; X64-MIC-AVX-LABEL: length32_eq:
+; X64-MIC-AVX:       # %bb.0:
+; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %ymm0
+; X64-MIC-AVX-NEXT:    vmovdqu (%rsi), %ymm1
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0
+; X64-MIC-AVX-NEXT:    kortestw %k0, %k0
+; X64-MIC-AVX-NEXT:    sete %al
+; X64-MIC-AVX-NEXT:    vzeroupper
+; X64-MIC-AVX-NEXT:    retq
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 32) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length32_lt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length32_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64:       loadbb2:
+; X64-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64:       loadbb3:
+; X64-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length32_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X64-SSE41:       res_block:
+; X64-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-SSE41:       loadbb:
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-SSE41-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-SSE41:       loadbb1:
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-SSE41-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-SSE41-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-SSE41:       loadbb2:
+; X64-SSE41-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-SSE41-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-SSE41-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-SSE41-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-SSE41-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-SSE41-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-SSE41-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-SSE41-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-SSE41:       loadbb3:
+; X64-SSE41-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-SSE41-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-SSE41-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-SSE41-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-SSE41-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-SSE41-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-SSE41-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-SSE41-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-SSE41:       endblock:
+; X64-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length32_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX1-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX1-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX1:       loadbb2:
+; X64-AVX1-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX1-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX1-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX1-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX1-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX1-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX1-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX1-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX1:       loadbb3:
+; X64-AVX1-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-AVX1-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-AVX1-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX1-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX1-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX1-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX1-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX1-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length32_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX2:       loadbb2:
+; X64-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX2-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX2-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX2-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX2-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX2-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX2-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX2-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX2:       loadbb3:
+; X64-AVX2-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-AVX2-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-AVX2-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX2-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX2-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX2-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX2-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX2-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length32_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW-256:       res_block:
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb:
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb1:
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       loadbb2:
+; X64-AVX512BW-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512BW-256-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512BW-256-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       loadbb3:
+; X64-AVX512BW-256-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-AVX512BW-256-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-AVX512BW-256-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512BW-256-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512BW-256-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       endblock:
+; X64-AVX512BW-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length32_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW:       res_block:
+; X64-AVX512BW-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512BW-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW:       loadbb:
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW:       loadbb1:
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512BW:       loadbb2:
+; X64-AVX512BW-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512BW-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512BW-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512BW-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512BW-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512BW:       loadbb3:
+; X64-AVX512BW-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-AVX512BW-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-AVX512BW-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512BW-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512BW-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512BW-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512BW-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW:       endblock:
+; X64-AVX512BW-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length32_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F-256:       res_block:
+; X64-AVX512F-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512F-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F-256:       loadbb:
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F-256:       loadbb1:
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       loadbb2:
+; X64-AVX512F-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512F-256-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512F-256-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       loadbb3:
+; X64-AVX512F-256-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-AVX512F-256-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-AVX512F-256-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512F-256-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512F-256-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       endblock:
+; X64-AVX512F-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length32_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F:       res_block:
+; X64-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F:       loadbb:
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F:       loadbb1:
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512F:       loadbb2:
+; X64-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512F-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512F-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512F-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512F-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512F-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512F:       loadbb3:
+; X64-AVX512F-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-AVX512F-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-AVX512F-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512F-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512F-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512F-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512F-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512F-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F:       endblock:
+; X64-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length32_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX2:       res_block:
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb:
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb1:
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       loadbb2:
+; X64-MIC-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-MIC-AVX2-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-MIC-AVX2-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       loadbb3:
+; X64-MIC-AVX2-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-MIC-AVX2-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-MIC-AVX2-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-MIC-AVX2-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-MIC-AVX2-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       endblock:
+; X64-MIC-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length32_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX512F:       res_block:
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb:
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb1:
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       loadbb2:
+; X64-MIC-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-MIC-AVX512F-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-MIC-AVX512F-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       loadbb3:
+; X64-MIC-AVX512F-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-MIC-AVX512F-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-MIC-AVX512F-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-MIC-AVX512F-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-MIC-AVX512F-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       endblock:
+; X64-MIC-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 32) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length32_gt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length32_gt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64:       loadbb2:
+; X64-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64:       loadbb3:
+; X64-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length32_gt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X64-SSE41:       res_block:
+; X64-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-SSE41:       loadbb:
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-SSE41-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-SSE41:       loadbb1:
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-SSE41-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-SSE41-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-SSE41:       loadbb2:
+; X64-SSE41-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-SSE41-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-SSE41-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-SSE41-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-SSE41-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-SSE41-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-SSE41-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-SSE41-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-SSE41:       loadbb3:
+; X64-SSE41-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-SSE41-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-SSE41-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-SSE41-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-SSE41-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-SSE41-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-SSE41-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-SSE41-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-SSE41:       endblock:
+; X64-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length32_gt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX1-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX1-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX1:       loadbb2:
+; X64-AVX1-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX1-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX1-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX1-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX1-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX1-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX1-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX1-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX1:       loadbb3:
+; X64-AVX1-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-AVX1-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-AVX1-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX1-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX1-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX1-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX1-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX1-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length32_gt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX2:       loadbb2:
+; X64-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX2-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX2-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX2-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX2-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX2-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX2-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX2-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX2:       loadbb3:
+; X64-AVX2-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-AVX2-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-AVX2-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX2-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX2-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX2-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX2-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX2-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length32_gt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW-256:       res_block:
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb:
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb1:
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       loadbb2:
+; X64-AVX512BW-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512BW-256-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512BW-256-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       loadbb3:
+; X64-AVX512BW-256-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-AVX512BW-256-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-AVX512BW-256-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512BW-256-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512BW-256-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       endblock:
+; X64-AVX512BW-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length32_gt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW:       res_block:
+; X64-AVX512BW-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512BW-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW:       loadbb:
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW:       loadbb1:
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512BW:       loadbb2:
+; X64-AVX512BW-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512BW-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512BW-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512BW-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512BW-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512BW:       loadbb3:
+; X64-AVX512BW-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-AVX512BW-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-AVX512BW-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512BW-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512BW-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512BW-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512BW-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW:       endblock:
+; X64-AVX512BW-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length32_gt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F-256:       res_block:
+; X64-AVX512F-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512F-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F-256:       loadbb:
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F-256:       loadbb1:
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       loadbb2:
+; X64-AVX512F-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512F-256-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512F-256-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       loadbb3:
+; X64-AVX512F-256-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-AVX512F-256-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-AVX512F-256-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512F-256-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512F-256-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       endblock:
+; X64-AVX512F-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length32_gt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F:       res_block:
+; X64-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F:       loadbb:
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F:       loadbb1:
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512F:       loadbb2:
+; X64-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512F-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512F-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512F-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512F-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512F-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512F:       loadbb3:
+; X64-AVX512F-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-AVX512F-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-AVX512F-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512F-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512F-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512F-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512F-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512F-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F:       endblock:
+; X64-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length32_gt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX2:       res_block:
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb:
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb1:
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       loadbb2:
+; X64-MIC-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-MIC-AVX2-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-MIC-AVX2-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       loadbb3:
+; X64-MIC-AVX2-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-MIC-AVX2-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-MIC-AVX2-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-MIC-AVX2-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-MIC-AVX2-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       endblock:
+; X64-MIC-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length32_gt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX512F:       res_block:
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb:
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb1:
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       loadbb2:
+; X64-MIC-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-MIC-AVX512F-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-MIC-AVX512F-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       loadbb3:
+; X64-MIC-AVX512F-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-MIC-AVX512F-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-MIC-AVX512F-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-MIC-AVX512F-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-MIC-AVX512F-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       endblock:
+; X64-MIC-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 32) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length32_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"="128" {
+; memcmp(x, y, 32) == 0 with "prefer-vector-width"="128": the IR check groups expect the call expanded into two i128 load pairs (offsets 0 and 16), XORed, ORed together and compared against zero.
+; X64-LABEL: define i1 @length32_eq_prefer128(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length32_eq_prefer128(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length32_eq_prefer128(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length32_eq_prefer128(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length32_eq_prefer128(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length32_eq_prefer128(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length32_eq_prefer128(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length32_eq_prefer128(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length32_eq_prefer128(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length32_eq_prefer128(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+; NOTE(review): the two check groups below hold x86 assembly rather than IR; presumably leftovers from the llc-based version of this test -- confirm their prefixes are still used by a RUN line.
+; X64-AVX-LABEL: length32_eq_prefer128:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-AVX-NEXT:    vmovdqu 16(%rdi), %xmm1
+; X64-AVX-NEXT:    vpxor 16(%rsi), %xmm1, %xmm1
+; X64-AVX-NEXT:    vpxor (%rsi), %xmm0, %xmm0
+; X64-AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vptest %xmm0, %xmm0
+; X64-AVX-NEXT:    sete %al
+; X64-AVX-NEXT:    retq
+; X64-MIC-AVX-LABEL: length32_eq_prefer128:
+; X64-MIC-AVX:       # %bb.0:
+; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-MIC-AVX-NEXT:    vmovdqu 16(%rdi), %xmm1
+; X64-MIC-AVX-NEXT:    vmovdqu (%rsi), %xmm2
+; X64-MIC-AVX-NEXT:    vmovdqu 16(%rsi), %xmm3
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm3, %zmm1, %k0
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm2, %zmm0, %k1
+; X64-MIC-AVX-NEXT:    kortestw %k0, %k1
+; X64-MIC-AVX-NEXT:    sete %al
+; X64-MIC-AVX-NEXT:    vzeroupper
+; X64-MIC-AVX-NEXT:    retq
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 32) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length32_eq_const(ptr %X) nounwind {
+; memcmp(X, @.str, 32) != 0: the X64 and X64-SSE41 groups expect two i128 loads XORed with integer literals and ORed; the other IR groups expect one i256 load compared against a single integer literal.
+; X64-LABEL: define i1 @length32_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 65382562593882267225249597816672106294
+; X64-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X64-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; X64-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-NEXT:    ret i1 [[TMP7]]
+;
+; X64-SSE41-LABEL: define i1 @length32_eq_const(
+; X64-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 65382562593882267225249597816672106294
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-SSE41-NEXT:    ret i1 [[TMP7]]
+;
+; X64-AVX1-LABEL: define i1 @length32_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = icmp ne i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX2-LABEL: define i1 @length32_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = icmp ne i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length32_eq_const(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = icmp ne i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX512BW-256-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX512BW-LABEL: define i1 @length32_eq_const(
+; X64-AVX512BW-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = icmp ne i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX512BW-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length32_eq_const(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = icmp ne i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX512F-256-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX512F-LABEL: define i1 @length32_eq_const(
+; X64-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = icmp ne i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX512F-NEXT:    ret i1 [[TMP2]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length32_eq_const(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = icmp ne i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-MIC-AVX2-NEXT:    ret i1 [[TMP2]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length32_eq_const(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = icmp ne i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP2]]
+; NOTE(review): the two check groups below hold x86 assembly rather than IR; presumably leftovers from the llc-based version of this test -- confirm their prefixes are still used by a RUN line.
+; X64-AVX512-LABEL: length32_eq_const:
+; X64-AVX512:       # %bb.0:
+; X64-AVX512-NEXT:    vmovdqu (%rdi), %ymm0
+; X64-AVX512-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; X64-AVX512-NEXT:    vptest %ymm0, %ymm0
+; X64-AVX512-NEXT:    setne %al
+; X64-AVX512-NEXT:    vzeroupper
+; X64-AVX512-NEXT:    retq
+; X64-MIC-AVX-LABEL: length32_eq_const:
+; X64-MIC-AVX:       # %bb.0:
+; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %ymm0
+; X64-MIC-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [858927408,926299444,825243960,892613426,959985462,858927408,926299444,825243960]
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0
+; X64-MIC-AVX-NEXT:    kortestw %k0, %k0
+; X64-MIC-AVX-NEXT:    setne %al
+; X64-MIC-AVX-NEXT:    vzeroupper
+; X64-MIC-AVX-NEXT:    retq
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 32) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length48(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length48(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR0]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-SSE41-LABEL: define i32 @length48(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5:[0-9]+]]
+; X64-SSE41-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @length48(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5:[0-9]+]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @length48(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5:[0-9]+]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length48(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5:[0-9]+]]
+; X64-AVX512BW-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-LABEL: define i32 @length48(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5:[0-9]+]]
+; X64-AVX512BW-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length48(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5:[0-9]+]]
+; X64-AVX512F-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-LABEL: define i32 @length48(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5:[0-9]+]]
+; X64-AVX512F-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length48(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5:[0-9]+]]
+; X64-MIC-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length48(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5:[0-9]+]]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[M]]
+;
+; Returns the raw i32 result of a 48-byte memcmp; every prefix checked above expects the call to remain as a call and its result to be returned directly.
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 48) nounwind
+  ret i32 %m
+}
+
+define i1 @length48_eq(ptr %x, ptr %y) nounwind {
+; X64-SSE-LABEL: length48_eq:
+; X64-SSE:       # %bb.0:
+; X64-SSE-NEXT:    pushq %rax
+; X64-SSE-NEXT:    movl $48, %edx
+; X64-SSE-NEXT:    callq memcmp
+; X64-SSE-NEXT:    testl %eax, %eax
+; X64-SSE-NEXT:    sete %al
+; X64-SSE-NEXT:    popq %rcx
+; X64-SSE-NEXT:    retq
+; Equality-only use of a 48-byte memcmp (the body compares the i32 result with 0). The IR checks below expect loads, xor and or in place of the call, using three i128 pairs or one i256 pair plus one zero-extended i128 pair depending on the prefix.
+; NOTE(review): the X64-SSE block above and the X64-AVX512 / X64-MIC-AVX blocks near the end are assembly-style checks, while every other prefix checks IR; presumably leftovers from the llc-based version of this test -- confirm against the RUN lines whether these prefixes are still used.
+; X64-LABEL: define i1 @length48_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-NEXT:    [[TMP11:%.*]] = load i128, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12:%.*]] = load i128, ptr [[TMP10]], align 1
+; X64-NEXT:    [[TMP13:%.*]] = xor i128 [[TMP11]], [[TMP12]]
+; X64-NEXT:    [[TMP14:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-NEXT:    [[TMP15:%.*]] = or i128 [[TMP14]], [[TMP13]]
+; X64-NEXT:    [[TMP16:%.*]] = icmp ne i128 [[TMP15]], 0
+; X64-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X64-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP17]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length48_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i128, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12:%.*]] = load i128, ptr [[TMP10]], align 1
+; X64-SSE41-NEXT:    [[TMP13:%.*]] = xor i128 [[TMP11]], [[TMP12]]
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-SSE41-NEXT:    [[TMP15:%.*]] = or i128 [[TMP14]], [[TMP13]]
+; X64-SSE41-NEXT:    [[TMP16:%.*]] = icmp ne i128 [[TMP15]], 0
+; X64-SSE41-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP17]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length48_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = zext i128 [[TMP6]] to i256
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = zext i128 [[TMP7]] to i256
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = xor i256 [[TMP8]], [[TMP9]]
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = or i256 [[TMP3]], [[TMP10]]
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = icmp ne i256 [[TMP11]], 0
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length48_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = zext i128 [[TMP6]] to i256
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = zext i128 [[TMP7]] to i256
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = xor i256 [[TMP8]], [[TMP9]]
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = or i256 [[TMP3]], [[TMP10]]
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = icmp ne i256 [[TMP11]], 0
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length48_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = zext i128 [[TMP6]] to i256
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = zext i128 [[TMP7]] to i256
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = xor i256 [[TMP8]], [[TMP9]]
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = or i256 [[TMP3]], [[TMP10]]
+; X64-AVX512BW-256-NEXT:    [[TMP12:%.*]] = icmp ne i256 [[TMP11]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length48_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = zext i128 [[TMP6]] to i256
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = zext i128 [[TMP7]] to i256
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = xor i256 [[TMP8]], [[TMP9]]
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = or i256 [[TMP3]], [[TMP10]]
+; X64-AVX512BW-NEXT:    [[TMP12:%.*]] = icmp ne i256 [[TMP11]], 0
+; X64-AVX512BW-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length48_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = zext i128 [[TMP6]] to i256
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = zext i128 [[TMP7]] to i256
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = xor i256 [[TMP8]], [[TMP9]]
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = or i256 [[TMP3]], [[TMP10]]
+; X64-AVX512F-256-NEXT:    [[TMP12:%.*]] = icmp ne i256 [[TMP11]], 0
+; X64-AVX512F-256-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length48_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = zext i128 [[TMP6]] to i256
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = zext i128 [[TMP7]] to i256
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = xor i256 [[TMP8]], [[TMP9]]
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = or i256 [[TMP3]], [[TMP10]]
+; X64-AVX512F-NEXT:    [[TMP12:%.*]] = icmp ne i256 [[TMP11]], 0
+; X64-AVX512F-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length48_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = zext i128 [[TMP6]] to i256
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = zext i128 [[TMP7]] to i256
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = xor i256 [[TMP8]], [[TMP9]]
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = or i256 [[TMP3]], [[TMP10]]
+; X64-MIC-AVX2-NEXT:    [[TMP12:%.*]] = icmp ne i256 [[TMP11]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length48_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = zext i128 [[TMP6]] to i256
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = zext i128 [[TMP7]] to i256
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = xor i256 [[TMP8]], [[TMP9]]
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = or i256 [[TMP3]], [[TMP10]]
+; X64-MIC-AVX512F-NEXT:    [[TMP12:%.*]] = icmp ne i256 [[TMP11]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512-LABEL: length48_eq:
+; X64-AVX512:       # %bb.0:
+; X64-AVX512-NEXT:    vmovdqu (%rdi), %ymm0
+; X64-AVX512-NEXT:    vmovdqu 32(%rdi), %xmm1
+; X64-AVX512-NEXT:    vmovdqu 32(%rsi), %xmm2
+; X64-AVX512-NEXT:    vpxor (%rsi), %ymm0, %ymm0
+; X64-AVX512-NEXT:    vpxor %ymm2, %ymm1, %ymm1
+; X64-AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT:    vptest %ymm0, %ymm0
+; X64-AVX512-NEXT:    sete %al
+; X64-AVX512-NEXT:    vzeroupper
+; X64-AVX512-NEXT:    retq
+; X64-MIC-AVX-LABEL: length48_eq:
+; X64-MIC-AVX:       # %bb.0:
+; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %ymm0
+; X64-MIC-AVX-NEXT:    vmovdqu (%rsi), %ymm1
+; X64-MIC-AVX-NEXT:    vmovdqu 32(%rdi), %xmm2
+; X64-MIC-AVX-NEXT:    vmovdqu 32(%rsi), %xmm3
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm3, %zmm2, %k0
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm1, %zmm0, %k1
+; X64-MIC-AVX-NEXT:    kortestw %k0, %k1
+; X64-MIC-AVX-NEXT:    sete %al
+; X64-MIC-AVX-NEXT:    vzeroupper
+; X64-MIC-AVX-NEXT:    retq
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 48) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length48_lt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length48_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length48_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length48_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length48_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length48_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length48_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length48_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length48_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length48_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length48_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; Signed "less than zero" test on a 48-byte memcmp result; every prefix checked above expects the memcmp call to remain, followed by the icmp slt.
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 48) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length48_gt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length48_gt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length48_gt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length48_gt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length48_gt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length48_gt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length48_gt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length48_gt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length48_gt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length48_gt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length48_gt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; Signed "greater than zero" test on a 48-byte memcmp result; every prefix checked above expects the memcmp call to remain, followed by the icmp sgt.
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 48) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length48_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"="128" {
+; X64-LABEL: define i1 @length48_eq_prefer128(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-NEXT:    [[TMP11:%.*]] = load i128, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12:%.*]] = load i128, ptr [[TMP10]], align 1
+; X64-NEXT:    [[TMP13:%.*]] = xor i128 [[TMP11]], [[TMP12]]
+; X64-NEXT:    [[TMP14:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-NEXT:    [[TMP15:%.*]] = or i128 [[TMP14]], [[TMP13]]
+; X64-NEXT:    [[TMP16:%.*]] = icmp ne i128 [[TMP15]], 0
+; X64-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X64-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP17]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length48_eq_prefer128(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i128, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12:%.*]] = load i128, ptr [[TMP10]], align 1
+; X64-SSE41-NEXT:    [[TMP13:%.*]] = xor i128 [[TMP11]], [[TMP12]]
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-SSE41-NEXT:    [[TMP15:%.*]] = or i128 [[TMP14]], [[TMP13]]
+; X64-SSE41-NEXT:    [[TMP16:%.*]] = icmp ne i128 [[TMP15]], 0
+; X64-SSE41-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP17]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length48_eq_prefer128(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i128, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = load i128, ptr [[TMP10]], align 1
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = xor i128 [[TMP11]], [[TMP12]]
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX1-NEXT:    [[TMP15:%.*]] = or i128 [[TMP14]], [[TMP13]]
+; X64-AVX1-NEXT:    [[TMP16:%.*]] = icmp ne i128 [[TMP15]], 0
+; X64-AVX1-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP17]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length48_eq_prefer128(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i128, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = load i128, ptr [[TMP10]], align 1
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = xor i128 [[TMP11]], [[TMP12]]
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX2-NEXT:    [[TMP15:%.*]] = or i128 [[TMP14]], [[TMP13]]
+; X64-AVX2-NEXT:    [[TMP16:%.*]] = icmp ne i128 [[TMP15]], 0
+; X64-AVX2-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP17]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length48_eq_prefer128(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i128, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12:%.*]] = load i128, ptr [[TMP10]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP13:%.*]] = xor i128 [[TMP11]], [[TMP12]]
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-256-NEXT:    [[TMP15:%.*]] = or i128 [[TMP14]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    [[TMP16:%.*]] = icmp ne i128 [[TMP15]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP17]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length48_eq_prefer128(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i128, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12:%.*]] = load i128, ptr [[TMP10]], align 1
+; X64-AVX512BW-NEXT:    [[TMP13:%.*]] = xor i128 [[TMP11]], [[TMP12]]
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-NEXT:    [[TMP15:%.*]] = or i128 [[TMP14]], [[TMP13]]
+; X64-AVX512BW-NEXT:    [[TMP16:%.*]] = icmp ne i128 [[TMP15]], 0
+; X64-AVX512BW-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP17]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length48_eq_prefer128(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i128, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12:%.*]] = load i128, ptr [[TMP10]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP13:%.*]] = xor i128 [[TMP11]], [[TMP12]]
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX512F-256-NEXT:    [[TMP15:%.*]] = or i128 [[TMP14]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    [[TMP16:%.*]] = icmp ne i128 [[TMP15]], 0
+; X64-AVX512F-256-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP17]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length48_eq_prefer128(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i128, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12:%.*]] = load i128, ptr [[TMP10]], align 1
+; X64-AVX512F-NEXT:    [[TMP13:%.*]] = xor i128 [[TMP11]], [[TMP12]]
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX512F-NEXT:    [[TMP15:%.*]] = or i128 [[TMP14]], [[TMP13]]
+; X64-AVX512F-NEXT:    [[TMP16:%.*]] = icmp ne i128 [[TMP15]], 0
+; X64-AVX512F-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP17]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length48_eq_prefer128(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i128, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12:%.*]] = load i128, ptr [[TMP10]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP13:%.*]] = xor i128 [[TMP11]], [[TMP12]]
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX2-NEXT:    [[TMP15:%.*]] = or i128 [[TMP14]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    [[TMP16:%.*]] = icmp ne i128 [[TMP15]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP17]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length48_eq_prefer128(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i128, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12:%.*]] = load i128, ptr [[TMP10]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP13:%.*]] = xor i128 [[TMP11]], [[TMP12]]
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX512F-NEXT:    [[TMP15:%.*]] = or i128 [[TMP14]], [[TMP13]]
+; X64-MIC-AVX512F-NEXT:    [[TMP16:%.*]] = icmp ne i128 [[TMP15]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP17]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 48) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length48_eq_const(ptr %X) nounwind {
+; X64-SSE-LABEL: length48_eq_const:
+; X64-SSE:       # %bb.0:
+; X64-SSE-NEXT:    pushq %rax
+; X64-SSE-NEXT:    movl $.L.str, %esi
+; X64-SSE-NEXT:    movl $48, %edx
+; X64-SSE-NEXT:    callq memcmp
+; X64-SSE-NEXT:    testl %eax, %eax
+; X64-SSE-NEXT:    setne %al
+; X64-SSE-NEXT:    popq %rcx
+; X64-SSE-NEXT:    retq
+;
+;
+; X64-LABEL: define i1 @length48_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 65382562593882267225249597816672106294
+; X64-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP6]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP7]], 73389002901949112059321871464991568690
+; X64-NEXT:    [[TMP9:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X64-NEXT:    [[TMP10:%.*]] = or i128 [[TMP9]], [[TMP8]]
+; X64-NEXT:    [[TMP11:%.*]] = icmp ne i128 [[TMP10]], 0
+; X64-NEXT:    [[TMP12:%.*]] = zext i1 [[TMP11]] to i32
+; X64-NEXT:    ret i1 [[TMP11]]
+;
+; X64-SSE41-LABEL: define i1 @length48_eq_const(
+; X64-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 65382562593882267225249597816672106294
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP6]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP7]], 73389002901949112059321871464991568690
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = or i128 [[TMP9]], [[TMP8]]
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = icmp ne i128 [[TMP10]], 0
+; X64-SSE41-NEXT:    [[TMP12:%.*]] = zext i1 [[TMP11]] to i32
+; X64-SSE41-NEXT:    ret i1 [[TMP11]]
+;
+; X64-AVX1-LABEL: define i1 @length48_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = zext i128 [[TMP4]] to i256
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = xor i256 [[TMP5]], 73389002901949112059321871464991568690
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = or i256 [[TMP2]], [[TMP6]]
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = icmp ne i256 [[TMP7]], 0
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP8]]
+;
+; X64-AVX2-LABEL: define i1 @length48_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = zext i128 [[TMP4]] to i256
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = xor i256 [[TMP5]], 73389002901949112059321871464991568690
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = or i256 [[TMP2]], [[TMP6]]
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = icmp ne i256 [[TMP7]], 0
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP8]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length48_eq_const(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = zext i128 [[TMP4]] to i256
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = xor i256 [[TMP5]], 73389002901949112059321871464991568690
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = or i256 [[TMP2]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = icmp ne i256 [[TMP7]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; X64-AVX512BW-256-NEXT:    ret i1 [[TMP8]]
+;
+; X64-AVX512BW-LABEL: define i1 @length48_eq_const(
+; X64-AVX512BW-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = zext i128 [[TMP4]] to i256
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = xor i256 [[TMP5]], 73389002901949112059321871464991568690
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = or i256 [[TMP2]], [[TMP6]]
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = icmp ne i256 [[TMP7]], 0
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; X64-AVX512BW-NEXT:    ret i1 [[TMP8]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length48_eq_const(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = zext i128 [[TMP4]] to i256
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = xor i256 [[TMP5]], 73389002901949112059321871464991568690
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = or i256 [[TMP2]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = icmp ne i256 [[TMP7]], 0
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; X64-AVX512F-256-NEXT:    ret i1 [[TMP8]]
+;
+; X64-AVX512F-LABEL: define i1 @length48_eq_const(
+; X64-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = zext i128 [[TMP4]] to i256
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = xor i256 [[TMP5]], 73389002901949112059321871464991568690
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = or i256 [[TMP2]], [[TMP6]]
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = icmp ne i256 [[TMP7]], 0
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; X64-AVX512F-NEXT:    ret i1 [[TMP8]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length48_eq_const(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = zext i128 [[TMP4]] to i256
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = xor i256 [[TMP5]], 73389002901949112059321871464991568690
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = or i256 [[TMP2]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = icmp ne i256 [[TMP7]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; X64-MIC-AVX2-NEXT:    ret i1 [[TMP8]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length48_eq_const(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = zext i128 [[TMP4]] to i256
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = xor i256 [[TMP5]], 73389002901949112059321871464991568690
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = or i256 [[TMP2]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = icmp ne i256 [[TMP7]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP8]]
+;
+; X64-AVX512-LABEL: length48_eq_const:
+; X64-AVX512:       # %bb.0:
+; X64-AVX512-NEXT:    vmovdqu (%rdi), %ymm0
+; X64-AVX512-NEXT:    vmovdqu 32(%rdi), %xmm1
+; X64-AVX512-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; X64-AVX512-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; X64-AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT:    vptest %ymm0, %ymm0
+; X64-AVX512-NEXT:    setne %al
+; X64-AVX512-NEXT:    vzeroupper
+; X64-AVX512-NEXT:    retq
+; X64-MIC-AVX-LABEL: length48_eq_const:
+; X64-MIC-AVX:       # %bb.0:
+; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %ymm0
+; X64-MIC-AVX-NEXT:    vmovdqu 32(%rdi), %xmm1
+; X64-MIC-AVX-NEXT:    vmovdqa {{.*#+}} ymm2 = [892613426,959985462,858927408,926299444,0,0,0,0]
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm2, %zmm1, %k0
+; X64-MIC-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [858927408,926299444,825243960,892613426,959985462,858927408,926299444,825243960]
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm1, %zmm0, %k1
+; X64-MIC-AVX-NEXT:    kortestw %k0, %k1
+; X64-MIC-AVX-NEXT:    setne %al
+; X64-MIC-AVX-NEXT:    vzeroupper
+; X64-MIC-AVX-NEXT:    retq
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 48) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length63(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length63(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR0]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-SSE41-LABEL: define i32 @length63(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-SSE41-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @length63(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @length63(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length63(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-LABEL: define i32 @length63(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length63(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-LABEL: define i32 @length63(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-AVX512F-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length63(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length63(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[M]]
+;
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 63) nounwind
+  ret i32 %m
+}
+
+define i1 @length63_eq(ptr %x, ptr %y) nounwind {
+; X64-SSE-LABEL: length63_eq:
+; X64-SSE:       # %bb.0:
+; X64-SSE-NEXT:    pushq %rax
+; X64-SSE-NEXT:    movl $63, %edx
+; X64-SSE-NEXT:    callq memcmp
+; X64-SSE-NEXT:    testl %eax, %eax
+; X64-SSE-NEXT:    setne %al
+; X64-SSE-NEXT:    popq %rcx
+; X64-SSE-NEXT:    retq
+;
+;
+; X64-LABEL: define i1 @length63_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-NEXT:    [[TMP11:%.*]] = load i128, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12:%.*]] = load i128, ptr [[TMP10]], align 1
+; X64-NEXT:    [[TMP13:%.*]] = xor i128 [[TMP11]], [[TMP12]]
+; X64-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 47
+; X64-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 47
+; X64-NEXT:    [[TMP16:%.*]] = load i128, ptr [[TMP14]], align 1
+; X64-NEXT:    [[TMP17:%.*]] = load i128, ptr [[TMP15]], align 1
+; X64-NEXT:    [[TMP18:%.*]] = xor i128 [[TMP16]], [[TMP17]]
+; X64-NEXT:    [[TMP19:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-NEXT:    [[TMP20:%.*]] = or i128 [[TMP13]], [[TMP18]]
+; X64-NEXT:    [[TMP21:%.*]] = or i128 [[TMP19]], [[TMP20]]
+; X64-NEXT:    [[TMP22:%.*]] = icmp ne i128 [[TMP21]], 0
+; X64-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X64-NEXT:    ret i1 [[TMP22]]
+;
+; X64-SSE41-LABEL: define i1 @length63_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i128, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12:%.*]] = load i128, ptr [[TMP10]], align 1
+; X64-SSE41-NEXT:    [[TMP13:%.*]] = xor i128 [[TMP11]], [[TMP12]]
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 47
+; X64-SSE41-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 47
+; X64-SSE41-NEXT:    [[TMP16:%.*]] = load i128, ptr [[TMP14]], align 1
+; X64-SSE41-NEXT:    [[TMP17:%.*]] = load i128, ptr [[TMP15]], align 1
+; X64-SSE41-NEXT:    [[TMP18:%.*]] = xor i128 [[TMP16]], [[TMP17]]
+; X64-SSE41-NEXT:    [[TMP19:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-SSE41-NEXT:    [[TMP20:%.*]] = or i128 [[TMP13]], [[TMP18]]
+; X64-SSE41-NEXT:    [[TMP21:%.*]] = or i128 [[TMP19]], [[TMP20]]
+; X64-SSE41-NEXT:    [[TMP22:%.*]] = icmp ne i128 [[TMP21]], 0
+; X64-SSE41-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X64-SSE41-NEXT:    ret i1 [[TMP22]]
+;
+; X64-AVX1-LABEL: define i1 @length63_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 31
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 31
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = icmp ne i256 [[TMP9]], 0
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP10]]
+;
+; X64-AVX2-LABEL: define i1 @length63_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 31
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 31
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = icmp ne i256 [[TMP9]], 0
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP10]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length63_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 31
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 31
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = icmp ne i256 [[TMP9]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512BW-256-NEXT:    ret i1 [[TMP10]]
+;
+; X64-AVX512BW-LABEL: define i1 @length63_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 31
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 31
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = icmp ne i256 [[TMP9]], 0
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512BW-NEXT:    ret i1 [[TMP10]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length63_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 31
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 31
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = icmp ne i256 [[TMP9]], 0
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512F-256-NEXT:    ret i1 [[TMP10]]
+;
+; X64-AVX512F-LABEL: define i1 @length63_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 31
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 31
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i256 [[TMP9]], 0
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512F-NEXT:    ret i1 [[TMP10]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length63_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 31
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 31
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = icmp ne i256 [[TMP9]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-MIC-AVX2-NEXT:    ret i1 [[TMP10]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length63_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 31
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 31
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i256 [[TMP9]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP10]]
+;
+; X64-AVX512-LABEL: length63_eq:
+; X64-AVX512:       # %bb.0:
+; X64-AVX512-NEXT:    vmovdqu (%rdi), %ymm0
+; X64-AVX512-NEXT:    vmovdqu 31(%rdi), %ymm1
+; X64-AVX512-NEXT:    vpxor 31(%rsi), %ymm1, %ymm1
+; X64-AVX512-NEXT:    vpxor (%rsi), %ymm0, %ymm0
+; X64-AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT:    vptest %ymm0, %ymm0
+; X64-AVX512-NEXT:    setne %al
+; X64-AVX512-NEXT:    vzeroupper
+; X64-AVX512-NEXT:    retq
+; X64-MIC-AVX-LABEL: length63_eq:
+; X64-MIC-AVX:       # %bb.0:
+; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %ymm0
+; X64-MIC-AVX-NEXT:    vmovdqu 31(%rdi), %ymm1
+; X64-MIC-AVX-NEXT:    vmovdqu (%rsi), %ymm2
+; X64-MIC-AVX-NEXT:    vmovdqu 31(%rsi), %ymm3
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm3, %zmm1, %k0
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm2, %zmm0, %k1
+; X64-MIC-AVX-NEXT:    kortestw %k0, %k1
+; X64-MIC-AVX-NEXT:    setne %al
+; X64-MIC-AVX-NEXT:    vzeroupper
+; X64-MIC-AVX-NEXT:    retq
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 63) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+; length63_lt tests `memcmp(x, y, 63) < 0`, i.e. a signed less-than-zero
+; compare of the i32 memcmp result. Every IR check prefix listed in this
+; function expects the memcmp call to be left in place, followed by the
+; original icmp slt.
+define i1 @length63_lt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length63_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length63_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length63_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length63_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length63_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length63_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length63_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length63_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length63_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length63_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 63) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+; length63_gt tests `memcmp(x, y, 63) > 0`, i.e. a signed greater-than-zero
+; compare of the i32 memcmp result. Every IR check prefix listed in this
+; function expects the memcmp call to be left in place, followed by the
+; original icmp sgt.
+define i1 @length63_gt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length63_gt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length63_gt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length63_gt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length63_gt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length63_gt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length63_gt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length63_gt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length63_gt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length63_gt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length63_gt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 63) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+; length63_eq_const tests `memcmp(X, @.str, 63) == 0`.
+; The IR check lines expect the call to be replaced by overlapping wide loads
+; of X that are xor'ed with immediate constants, or-reduced and compared
+; against zero. The X64 and X64-SSE41 prefixes expect four i128 loads at
+; offsets 0/16/32/47; the AVX, AVX512 and MIC IR prefixes expect two i256
+; loads at offsets 0/31. The immediates are presumably the matching bytes of
+; @.str, whose definition is not visible in this part of the file.
+; NOTE(review): the X64-SSE, X64-AVX512 and X64-MIC-AVX blocks in this function
+; match assembly rather than IR and look like leftovers from the llc-based
+; version of this test; confirm those prefixes are unused by the RUN lines
+; and drop them.
+define i1 @length63_eq_const(ptr %X) nounwind {
+; X64-SSE-LABEL: length63_eq_const:
+; X64-SSE:       # %bb.0:
+; X64-SSE-NEXT:    pushq %rax
+; X64-SSE-NEXT:    movl $.L.str, %esi
+; X64-SSE-NEXT:    movl $63, %edx
+; X64-SSE-NEXT:    callq memcmp
+; X64-SSE-NEXT:    testl %eax, %eax
+; X64-SSE-NEXT:    sete %al
+; X64-SSE-NEXT:    popq %rcx
+; X64-SSE-NEXT:    retq
+;
+;
+; X64-LABEL: define i1 @length63_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 65382562593882267225249597816672106294
+; X64-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP6]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP7]], 73389002901949112059321871464991568690
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 47
+; X64-NEXT:    [[TMP10:%.*]] = load i128, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = xor i128 [[TMP10]], 66716800424378146251538984255488604215
+; X64-NEXT:    [[TMP12:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X64-NEXT:    [[TMP13:%.*]] = or i128 [[TMP8]], [[TMP11]]
+; X64-NEXT:    [[TMP14:%.*]] = or i128 [[TMP12]], [[TMP13]]
+; X64-NEXT:    [[TMP15:%.*]] = icmp ne i128 [[TMP14]], 0
+; X64-NEXT:    [[TMP16:%.*]] = zext i1 [[TMP15]] to i32
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP16]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length63_eq_const(
+; X64-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 65382562593882267225249597816672106294
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP6]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP7]], 73389002901949112059321871464991568690
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 47
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = load i128, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = xor i128 [[TMP10]], 66716800424378146251538984255488604215
+; X64-SSE41-NEXT:    [[TMP12:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X64-SSE41-NEXT:    [[TMP13:%.*]] = or i128 [[TMP8]], [[TMP11]]
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = or i128 [[TMP12]], [[TMP13]]
+; X64-SSE41-NEXT:    [[TMP15:%.*]] = icmp ne i128 [[TMP14]], 0
+; X64-SSE41-NEXT:    [[TMP16:%.*]] = zext i1 [[TMP15]] to i32
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP16]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length63_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 31
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 22702550761799267355187145649125784605216755694630776232256222584591002841649
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp ne i256 [[TMP6]], 0
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP8]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length63_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 31
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 22702550761799267355187145649125784605216755694630776232256222584591002841649
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp ne i256 [[TMP6]], 0
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP8]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length63_eq_const(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 31
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 22702550761799267355187145649125784605216755694630776232256222584591002841649
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp ne i256 [[TMP6]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP8]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length63_eq_const(
+; X64-AVX512BW-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 31
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 22702550761799267355187145649125784605216755694630776232256222584591002841649
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = icmp ne i256 [[TMP6]], 0
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP8]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length63_eq_const(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 31
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 22702550761799267355187145649125784605216755694630776232256222584591002841649
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp ne i256 [[TMP6]], 0
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP8]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length63_eq_const(
+; X64-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 31
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 22702550761799267355187145649125784605216755694630776232256222584591002841649
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = icmp ne i256 [[TMP6]], 0
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP8]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length63_eq_const(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 31
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 22702550761799267355187145649125784605216755694630776232256222584591002841649
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp ne i256 [[TMP6]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP8]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length63_eq_const(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 31
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 22702550761799267355187145649125784605216755694630776232256222584591002841649
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = icmp ne i256 [[TMP6]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP8]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512-LABEL: length63_eq_const:
+; X64-AVX512:       # %bb.0:
+; X64-AVX512-NEXT:    vmovdqu (%rdi), %ymm0
+; X64-AVX512-NEXT:    vmovdqu 31(%rdi), %ymm1
+; X64-AVX512-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; X64-AVX512-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; X64-AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT:    vptest %ymm0, %ymm0
+; X64-AVX512-NEXT:    sete %al
+; X64-AVX512-NEXT:    vzeroupper
+; X64-AVX512-NEXT:    retq
+; X64-MIC-AVX-LABEL: length63_eq_const:
+; X64-MIC-AVX:       # %bb.0:
+; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %ymm0
+; X64-MIC-AVX-NEXT:    vmovdqu 31(%rdi), %ymm1
+; X64-MIC-AVX-NEXT:    vmovdqa {{.*#+}} ymm2 = [875770417,943142453,842084409,909456435,809056311,875770417,943142453,842084409]
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm2, %zmm1, %k0
+; X64-MIC-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [858927408,926299444,825243960,892613426,959985462,858927408,926299444,825243960]
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm1, %zmm0, %k1
+; X64-MIC-AVX-NEXT:    kortestw %k0, %k1
+; X64-MIC-AVX-NEXT:    sete %al
+; X64-MIC-AVX-NEXT:    vzeroupper
+; X64-MIC-AVX-NEXT:    retq
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 63) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; length64 returns the raw i32 result of `memcmp(X, Y, 64)` (three-way
+; compare, no equality-only use). Every IR check prefix listed in this
+; function expects the memcmp call to be left in place and its result
+; returned directly.
+define i32 @length64(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length64(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR0]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-SSE41-LABEL: define i32 @length64(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-SSE41-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @length64(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @length64(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length64(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-LABEL: define i32 @length64(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length64(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-LABEL: define i32 @length64(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-AVX512F-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length64(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length64(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[M]]
+;
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 64) nounwind
+  ret i32 %m
+}
+
+; length64_eq tests `memcmp(x, y, 64) != 0` (despite the _eq suffix, the
+; input IR compares the result with icmp ne).
+; The IR check lines expect the call to be replaced by wide loads of both
+; buffers. The X64 and X64-SSE41 prefixes expect four i128 load pairs at
+; offsets 0/16/32/48, xor'ed pairwise and or-reduced; the AVX1, AVX2,
+; AVX512BW-256, AVX512F-256 and MIC-AVX2 prefixes expect two i256 load pairs
+; at offsets 0/32; the AVX512BW, AVX512F and MIC-AVX512F prefixes expect a
+; single i512 load pair compared directly with icmp ne.
+; NOTE(review): the X64-SSE and X64-AVX512 blocks in this function match
+; assembly rather than IR and look like leftovers from the llc-based version
+; of this test; confirm those prefixes are unused by the RUN lines and drop
+; them.
+define i1 @length64_eq(ptr %x, ptr %y) nounwind {
+; X64-SSE-LABEL: length64_eq:
+; X64-SSE:       # %bb.0:
+; X64-SSE-NEXT:    pushq %rax
+; X64-SSE-NEXT:    movl $64, %edx
+; X64-SSE-NEXT:    callq memcmp
+; X64-SSE-NEXT:    testl %eax, %eax
+; X64-SSE-NEXT:    setne %al
+; X64-SSE-NEXT:    popq %rcx
+; X64-SSE-NEXT:    retq
+;
+;
+; X64-LABEL: define i1 @length64_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-NEXT:    [[TMP11:%.*]] = load i128, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12:%.*]] = load i128, ptr [[TMP10]], align 1
+; X64-NEXT:    [[TMP13:%.*]] = xor i128 [[TMP11]], [[TMP12]]
+; X64-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 48
+; X64-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 48
+; X64-NEXT:    [[TMP16:%.*]] = load i128, ptr [[TMP14]], align 1
+; X64-NEXT:    [[TMP17:%.*]] = load i128, ptr [[TMP15]], align 1
+; X64-NEXT:    [[TMP18:%.*]] = xor i128 [[TMP16]], [[TMP17]]
+; X64-NEXT:    [[TMP19:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-NEXT:    [[TMP20:%.*]] = or i128 [[TMP13]], [[TMP18]]
+; X64-NEXT:    [[TMP21:%.*]] = or i128 [[TMP19]], [[TMP20]]
+; X64-NEXT:    [[TMP22:%.*]] = icmp ne i128 [[TMP21]], 0
+; X64-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X64-NEXT:    ret i1 [[TMP22]]
+;
+; X64-SSE41-LABEL: define i1 @length64_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i128, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12:%.*]] = load i128, ptr [[TMP10]], align 1
+; X64-SSE41-NEXT:    [[TMP13:%.*]] = xor i128 [[TMP11]], [[TMP12]]
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 48
+; X64-SSE41-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 48
+; X64-SSE41-NEXT:    [[TMP16:%.*]] = load i128, ptr [[TMP14]], align 1
+; X64-SSE41-NEXT:    [[TMP17:%.*]] = load i128, ptr [[TMP15]], align 1
+; X64-SSE41-NEXT:    [[TMP18:%.*]] = xor i128 [[TMP16]], [[TMP17]]
+; X64-SSE41-NEXT:    [[TMP19:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-SSE41-NEXT:    [[TMP20:%.*]] = or i128 [[TMP13]], [[TMP18]]
+; X64-SSE41-NEXT:    [[TMP21:%.*]] = or i128 [[TMP19]], [[TMP20]]
+; X64-SSE41-NEXT:    [[TMP22:%.*]] = icmp ne i128 [[TMP21]], 0
+; X64-SSE41-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X64-SSE41-NEXT:    ret i1 [[TMP22]]
+;
+; X64-AVX1-LABEL: define i1 @length64_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = icmp ne i256 [[TMP9]], 0
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP10]]
+;
+; X64-AVX2-LABEL: define i1 @length64_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = icmp ne i256 [[TMP9]], 0
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP10]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length64_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = icmp ne i256 [[TMP9]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512BW-256-NEXT:    ret i1 [[TMP10]]
+;
+; X64-AVX512BW-LABEL: define i1 @length64_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i512, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = icmp ne i512 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512BW-NEXT:    ret i1 [[TMP3]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length64_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = icmp ne i256 [[TMP9]], 0
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512F-256-NEXT:    ret i1 [[TMP10]]
+;
+; X64-AVX512F-LABEL: define i1 @length64_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = icmp ne i512 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512F-NEXT:    ret i1 [[TMP3]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length64_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = icmp ne i256 [[TMP9]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-MIC-AVX2-NEXT:    ret i1 [[TMP10]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length64_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = icmp ne i512 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP3]]
+;
+; X64-AVX512-LABEL: length64_eq:
+; X64-AVX512:       # %bb.0:
+; X64-AVX512-NEXT:    vmovdqu64 (%rdi), %zmm0
+; X64-AVX512-NEXT:    vpcmpneqd (%rsi), %zmm0, %k0
+; X64-AVX512-NEXT:    kortestw %k0, %k0
+; X64-AVX512-NEXT:    setne %al
+; X64-AVX512-NEXT:    vzeroupper
+; X64-AVX512-NEXT:    retq
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 64) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+; length64_lt tests `memcmp(x, y, 64) < 0`, i.e. a signed less-than-zero
+; compare of the i32 memcmp result. Every IR check prefix listed in this
+; function expects the memcmp call to be left in place, followed by the
+; original icmp slt.
+define i1 @length64_lt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length64_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length64_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length64_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length64_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length64_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length64_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length64_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length64_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length64_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length64_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 64) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+; Ordering use of a 64-byte memcmp. The call result is only tested with
+; `icmp sgt ..., 0`. Every check prefix below expects the memcmp call to be
+; left in place, followed by the original signed comparison.
+define i1 @length64_gt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length64_gt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length64_gt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length64_gt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length64_gt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length64_gt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length64_gt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length64_gt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length64_gt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length64_gt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length64_gt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 64) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+; Equality test of the 64 bytes at %X against the global @.str
+; (memcmp result compared with `icmp eq ..., 0`).
+; Expected IR per check prefix, as encoded in the assertions below
+;   - X64 and X64-SSE41 use four i128 loads from %X (offsets 0, 16, 32, 48),
+;     each xor'ed with an immediate, or-reduced and compared against zero.
+;   - X64-AVX1, X64-AVX2, X64-AVX512BW-256, X64-AVX512F-256 and X64-MIC-AVX2
+;     use two i256 loads from %X (offsets 0 and 32), each xor'ed with an
+;     immediate, or'ed together and compared against zero.
+;   - X64-AVX512BW, X64-AVX512F and X64-MIC-AVX512F use a single i512 load
+;     from %X compared against an i512 load from @.str.
+; NOTE(review) the X64-SSE and X64-AVX512 check lines contain x86 assembly
+; rather than IR; they look like leftovers from an llc-based version of this
+; test. Confirm whether those prefixes are still referenced by a RUN line.
+define i1 @length64_eq_const(ptr %X) nounwind {
+; X64-SSE-LABEL: length64_eq_const:
+; X64-SSE:       # %bb.0:
+; X64-SSE-NEXT:    pushq %rax
+; X64-SSE-NEXT:    movl $.L.str, %esi
+; X64-SSE-NEXT:    movl $64, %edx
+; X64-SSE-NEXT:    callq memcmp
+; X64-SSE-NEXT:    testl %eax, %eax
+; X64-SSE-NEXT:    sete %al
+; X64-SSE-NEXT:    popq %rcx
+; X64-SSE-NEXT:    retq
+;
+;
+; X64-LABEL: define i1 @length64_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 65382562593882267225249597816672106294
+; X64-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP6]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP7]], 73389002901949112059321871464991568690
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 48
+; X64-NEXT:    [[TMP10:%.*]] = load i128, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = xor i128 [[TMP10]], 68051240286688436651889234231545575736
+; X64-NEXT:    [[TMP12:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X64-NEXT:    [[TMP13:%.*]] = or i128 [[TMP8]], [[TMP11]]
+; X64-NEXT:    [[TMP14:%.*]] = or i128 [[TMP12]], [[TMP13]]
+; X64-NEXT:    [[TMP15:%.*]] = icmp ne i128 [[TMP14]], 0
+; X64-NEXT:    [[TMP16:%.*]] = zext i1 [[TMP15]] to i32
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP16]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length64_eq_const(
+; X64-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 65382562593882267225249597816672106294
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP6]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP7]], 73389002901949112059321871464991568690
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 48
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = load i128, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = xor i128 [[TMP10]], 68051240286688436651889234231545575736
+; X64-SSE41-NEXT:    [[TMP12:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X64-SSE41-NEXT:    [[TMP13:%.*]] = or i128 [[TMP8]], [[TMP11]]
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = or i128 [[TMP12]], [[TMP13]]
+; X64-SSE41-NEXT:    [[TMP15:%.*]] = icmp ne i128 [[TMP14]], 0
+; X64-SSE41-NEXT:    [[TMP16:%.*]] = zext i1 [[TMP15]] to i32
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP16]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length64_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp ne i256 [[TMP6]], 0
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP8]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length64_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp ne i256 [[TMP6]], 0
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP8]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length64_eq_const(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp ne i256 [[TMP6]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP8]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length64_eq_const(
+; X64-AVX512BW-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i512, ptr @.str, align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = icmp ne i512 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length64_eq_const(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp ne i256 [[TMP6]], 0
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP8]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length64_eq_const(
+; X64-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr @.str, align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = icmp ne i512 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length64_eq_const(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp ne i256 [[TMP6]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP8]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length64_eq_const(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr @.str, align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = icmp ne i512 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512-LABEL: length64_eq_const:
+; X64-AVX512:       # %bb.0:
+; X64-AVX512-NEXT:    vmovdqu64 (%rdi), %zmm0
+; X64-AVX512-NEXT:    vpcmpneqd .L.str(%rip), %zmm0, %k0
+; X64-AVX512-NEXT:    kortestw %k0, %k0
+; X64-AVX512-NEXT:    sete %al
+; X64-AVX512-NEXT:    vzeroupper
+; X64-AVX512-NEXT:    retq
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 64) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; Three-way use of a 96-byte memcmp. The i32 result is returned directly.
+; Every check prefix below expects the memcmp call to be left in place.
+define i32 @length96(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length96(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR0]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-SSE41-LABEL: define i32 @length96(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-SSE41-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @length96(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @length96(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length96(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-LABEL: define i32 @length96(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length96(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-LABEL: define i32 @length96(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-AVX512F-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length96(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length96(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[M]]
+;
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 96) nounwind
+  ret i32 %m
+}
+
+; Zero/non-zero use of a 96-byte memcmp. Despite the `_eq` suffix, the body
+; returns the `icmp ne ..., 0` result, i.e. true when the buffers differ.
+; Expected IR per check prefix, as encoded in the assertions below
+;   - X64 and X64-SSE41 keep the memcmp call.
+;   - X64-AVX1, X64-AVX2, X64-AVX512BW-256, X64-AVX512F-256 and X64-MIC-AVX2
+;     use three pairs of i256 loads (offsets 0, 32, 64), xor each pair,
+;     or the results together and compare against zero.
+;   - X64-AVX512BW, X64-AVX512F and X64-MIC-AVX512F use one pair of i512
+;     loads at offset 0 plus one pair of i256 loads at offset 64 that are
+;     zero-extended to i512 before the xor/or/compare.
+; NOTE(review) the X64-SSE check lines contain x86 assembly rather than IR;
+; they look like leftovers from an llc-based version of this test. Confirm
+; whether that prefix is still referenced by a RUN line.
+define i1 @length96_eq(ptr %x, ptr %y) nounwind {
+; X64-SSE-LABEL: length96_eq:
+; X64-SSE:       # %bb.0:
+; X64-SSE-NEXT:    pushq %rax
+; X64-SSE-NEXT:    movl $96, %edx
+; X64-SSE-NEXT:    callq memcmp
+; X64-SSE-NEXT:    testl %eax, %eax
+; X64-SSE-NEXT:    setne %al
+; X64-SSE-NEXT:    popq %rcx
+; X64-SSE-NEXT:    retq
+;
+;
+; X64-LABEL: define i1 @length96_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length96_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length96_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = load i256, ptr [[TMP10]], align 1
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = xor i256 [[TMP11]], [[TMP12]]
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX1-NEXT:    [[TMP15:%.*]] = or i256 [[TMP14]], [[TMP13]]
+; X64-AVX1-NEXT:    [[TMP16:%.*]] = icmp ne i256 [[TMP15]], 0
+; X64-AVX1-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP16]]
+;
+; X64-AVX2-LABEL: define i1 @length96_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = load i256, ptr [[TMP10]], align 1
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = xor i256 [[TMP11]], [[TMP12]]
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX2-NEXT:    [[TMP15:%.*]] = or i256 [[TMP14]], [[TMP13]]
+; X64-AVX2-NEXT:    [[TMP16:%.*]] = icmp ne i256 [[TMP15]], 0
+; X64-AVX2-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP16]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length96_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12:%.*]] = load i256, ptr [[TMP10]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP13:%.*]] = xor i256 [[TMP11]], [[TMP12]]
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-256-NEXT:    [[TMP15:%.*]] = or i256 [[TMP14]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    [[TMP16:%.*]] = icmp ne i256 [[TMP15]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X64-AVX512BW-256-NEXT:    ret i1 [[TMP16]]
+;
+; X64-AVX512BW-LABEL: define i1 @length96_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i512, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = zext i256 [[TMP6]] to i512
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = zext i256 [[TMP7]] to i512
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = xor i512 [[TMP8]], [[TMP9]]
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = or i512 [[TMP3]], [[TMP10]]
+; X64-AVX512BW-NEXT:    [[TMP12:%.*]] = icmp ne i512 [[TMP11]], 0
+; X64-AVX512BW-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512BW-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length96_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12:%.*]] = load i256, ptr [[TMP10]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP13:%.*]] = xor i256 [[TMP11]], [[TMP12]]
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX512F-256-NEXT:    [[TMP15:%.*]] = or i256 [[TMP14]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    [[TMP16:%.*]] = icmp ne i256 [[TMP15]], 0
+; X64-AVX512F-256-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X64-AVX512F-256-NEXT:    ret i1 [[TMP16]]
+;
+; X64-AVX512F-LABEL: define i1 @length96_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = zext i256 [[TMP6]] to i512
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = zext i256 [[TMP7]] to i512
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = xor i512 [[TMP8]], [[TMP9]]
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = or i512 [[TMP3]], [[TMP10]]
+; X64-AVX512F-NEXT:    [[TMP12:%.*]] = icmp ne i512 [[TMP11]], 0
+; X64-AVX512F-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512F-NEXT:    ret i1 [[TMP12]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length96_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12:%.*]] = load i256, ptr [[TMP10]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP13:%.*]] = xor i256 [[TMP11]], [[TMP12]]
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX2-NEXT:    [[TMP15:%.*]] = or i256 [[TMP14]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    [[TMP16:%.*]] = icmp ne i256 [[TMP15]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X64-MIC-AVX2-NEXT:    ret i1 [[TMP16]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length96_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = zext i256 [[TMP6]] to i512
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = zext i256 [[TMP7]] to i512
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = xor i512 [[TMP8]], [[TMP9]]
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = or i512 [[TMP3]], [[TMP10]]
+; X64-MIC-AVX512F-NEXT:    [[TMP12:%.*]] = icmp ne i512 [[TMP11]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP12]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 96) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+; Ordering use of a 96-byte memcmp. The call result is only tested with
+; `icmp slt ..., 0`. Every check prefix below expects the memcmp call to be
+; left in place, followed by the original signed comparison.
+define i1 @length96_lt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length96_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length96_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length96_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length96_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length96_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length96_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length96_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length96_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length96_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length96_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 96) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length96_gt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length96_gt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length96_gt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length96_gt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length96_gt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length96_gt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length96_gt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length96_gt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length96_gt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length96_gt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length96_gt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 96) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length96_eq_const(ptr %X) nounwind {
+; X64-SSE-LABEL: length96_eq_const:
+; X64-SSE:       # %bb.0:
+; X64-SSE-NEXT:    pushq %rax
+; X64-SSE-NEXT:    movl $.L.str, %esi
+; X64-SSE-NEXT:    movl $96, %edx
+; X64-SSE-NEXT:    callq memcmp
+; X64-SSE-NEXT:    testl %eax, %eax
+; X64-SSE-NEXT:    sete %al
+; X64-SSE-NEXT:    popq %rcx
+; X64-SSE-NEXT:    retq
+;
+;
+; X64-LABEL: define i1 @length96_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 96) #[[ATTR0]]
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length96_eq_const(
+; X64-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 96) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length96_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP6]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP7]], 24064810364522754539996825585178935186817565138301605567169177049701086016820
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = or i256 [[TMP9]], [[TMP8]]
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = icmp ne i256 [[TMP10]], 0
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = zext i1 [[TMP11]] to i32
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP12]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length96_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP6]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP7]], 24064810364522754539996825585178935186817565138301605567169177049701086016820
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = or i256 [[TMP9]], [[TMP8]]
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = icmp ne i256 [[TMP10]], 0
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = zext i1 [[TMP11]] to i32
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP12]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length96_eq_const(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP6]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP7]], 24064810364522754539996825585178935186817565138301605567169177049701086016820
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = or i256 [[TMP9]], [[TMP8]]
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = icmp ne i256 [[TMP10]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP12:%.*]] = zext i1 [[TMP11]] to i32
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP12]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length96_eq_const(
+; X64-AVX512BW-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i512, ptr @.str, align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = zext i256 [[TMP5]] to i512
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = xor i512 [[TMP6]], 24064810364522754539996825585178935186817565138301605567169177049701086016820
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = or i512 [[TMP3]], [[TMP7]]
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = icmp ne i512 [[TMP8]], 0
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = zext i1 [[TMP9]] to i32
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP10]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length96_eq_const(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP6]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP7]], 24064810364522754539996825585178935186817565138301605567169177049701086016820
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = or i256 [[TMP9]], [[TMP8]]
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = icmp ne i256 [[TMP10]], 0
+; X64-AVX512F-256-NEXT:    [[TMP12:%.*]] = zext i1 [[TMP11]] to i32
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP12]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length96_eq_const(
+; X64-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr @.str, align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = zext i256 [[TMP5]] to i512
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = xor i512 [[TMP6]], 24064810364522754539996825585178935186817565138301605567169177049701086016820
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = or i512 [[TMP3]], [[TMP7]]
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = icmp ne i512 [[TMP8]], 0
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = zext i1 [[TMP9]] to i32
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP10]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length96_eq_const(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP6]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP7]], 24064810364522754539996825585178935186817565138301605567169177049701086016820
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = or i256 [[TMP9]], [[TMP8]]
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = icmp ne i256 [[TMP10]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP12:%.*]] = zext i1 [[TMP11]] to i32
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP12]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length96_eq_const(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr @.str, align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = zext i256 [[TMP5]] to i512
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = xor i512 [[TMP6]], 24064810364522754539996825585178935186817565138301605567169177049701086016820
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = or i512 [[TMP3]], [[TMP7]]
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = icmp ne i512 [[TMP8]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = zext i1 [[TMP9]] to i32
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP10]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 96) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length127(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length127(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR0]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-SSE41-LABEL: define i32 @length127(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-SSE41-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @length127(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @length127(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length127(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-LABEL: define i32 @length127(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length127(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-LABEL: define i32 @length127(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-AVX512F-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length127(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length127(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[M]]
+;
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 127) nounwind
+  ret i32 %m
+}
+
+define i1 @length127_eq(ptr %x, ptr %y) nounwind {
+; X64-SSE-LABEL: length127_eq:
+; X64-SSE:       # %bb.0:
+; X64-SSE-NEXT:    pushq %rax
+; X64-SSE-NEXT:    movl $127, %edx
+; X64-SSE-NEXT:    callq memcmp
+; X64-SSE-NEXT:    testl %eax, %eax
+; X64-SSE-NEXT:    setne %al
+; X64-SSE-NEXT:    popq %rcx
+; X64-SSE-NEXT:    retq
+;
+;
+; X64-LABEL: define i1 @length127_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length127_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length127_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = load i256, ptr [[TMP10]], align 1
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = xor i256 [[TMP11]], [[TMP12]]
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 95
+; X64-AVX1-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 95
+; X64-AVX1-NEXT:    [[TMP16:%.*]] = load i256, ptr [[TMP14]], align 1
+; X64-AVX1-NEXT:    [[TMP17:%.*]] = load i256, ptr [[TMP15]], align 1
+; X64-AVX1-NEXT:    [[TMP18:%.*]] = xor i256 [[TMP16]], [[TMP17]]
+; X64-AVX1-NEXT:    [[TMP19:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX1-NEXT:    [[TMP20:%.*]] = or i256 [[TMP13]], [[TMP18]]
+; X64-AVX1-NEXT:    [[TMP21:%.*]] = or i256 [[TMP19]], [[TMP20]]
+; X64-AVX1-NEXT:    [[TMP22:%.*]] = icmp ne i256 [[TMP21]], 0
+; X64-AVX1-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP22]]
+;
+; X64-AVX2-LABEL: define i1 @length127_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = load i256, ptr [[TMP10]], align 1
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = xor i256 [[TMP11]], [[TMP12]]
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 95
+; X64-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 95
+; X64-AVX2-NEXT:    [[TMP16:%.*]] = load i256, ptr [[TMP14]], align 1
+; X64-AVX2-NEXT:    [[TMP17:%.*]] = load i256, ptr [[TMP15]], align 1
+; X64-AVX2-NEXT:    [[TMP18:%.*]] = xor i256 [[TMP16]], [[TMP17]]
+; X64-AVX2-NEXT:    [[TMP19:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX2-NEXT:    [[TMP20:%.*]] = or i256 [[TMP13]], [[TMP18]]
+; X64-AVX2-NEXT:    [[TMP21:%.*]] = or i256 [[TMP19]], [[TMP20]]
+; X64-AVX2-NEXT:    [[TMP22:%.*]] = icmp ne i256 [[TMP21]], 0
+; X64-AVX2-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP22]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length127_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12:%.*]] = load i256, ptr [[TMP10]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP13:%.*]] = xor i256 [[TMP11]], [[TMP12]]
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 95
+; X64-AVX512BW-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 95
+; X64-AVX512BW-256-NEXT:    [[TMP16:%.*]] = load i256, ptr [[TMP14]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP17:%.*]] = load i256, ptr [[TMP15]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP18:%.*]] = xor i256 [[TMP16]], [[TMP17]]
+; X64-AVX512BW-256-NEXT:    [[TMP19:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-256-NEXT:    [[TMP20:%.*]] = or i256 [[TMP13]], [[TMP18]]
+; X64-AVX512BW-256-NEXT:    [[TMP21:%.*]] = or i256 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-256-NEXT:    [[TMP22:%.*]] = icmp ne i256 [[TMP21]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X64-AVX512BW-256-NEXT:    ret i1 [[TMP22]]
+;
+; X64-AVX512BW-LABEL: define i1 @length127_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i512, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 63
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 63
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i512, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = xor i512 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = or i512 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = icmp ne i512 [[TMP9]], 0
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512BW-NEXT:    ret i1 [[TMP10]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length127_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12:%.*]] = load i256, ptr [[TMP10]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP13:%.*]] = xor i256 [[TMP11]], [[TMP12]]
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 95
+; X64-AVX512F-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 95
+; X64-AVX512F-256-NEXT:    [[TMP16:%.*]] = load i256, ptr [[TMP14]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP17:%.*]] = load i256, ptr [[TMP15]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP18:%.*]] = xor i256 [[TMP16]], [[TMP17]]
+; X64-AVX512F-256-NEXT:    [[TMP19:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX512F-256-NEXT:    [[TMP20:%.*]] = or i256 [[TMP13]], [[TMP18]]
+; X64-AVX512F-256-NEXT:    [[TMP21:%.*]] = or i256 [[TMP19]], [[TMP20]]
+; X64-AVX512F-256-NEXT:    [[TMP22:%.*]] = icmp ne i256 [[TMP21]], 0
+; X64-AVX512F-256-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X64-AVX512F-256-NEXT:    ret i1 [[TMP22]]
+;
+; X64-AVX512F-LABEL: define i1 @length127_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 63
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 63
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i512, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = xor i512 [[TMP6]], [[TMP7]]
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = or i512 [[TMP3]], [[TMP8]]
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i512 [[TMP9]], 0
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512F-NEXT:    ret i1 [[TMP10]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length127_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12:%.*]] = load i256, ptr [[TMP10]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP13:%.*]] = xor i256 [[TMP11]], [[TMP12]]
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 95
+; X64-MIC-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 95
+; X64-MIC-AVX2-NEXT:    [[TMP16:%.*]] = load i256, ptr [[TMP14]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP17:%.*]] = load i256, ptr [[TMP15]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP18:%.*]] = xor i256 [[TMP16]], [[TMP17]]
+; X64-MIC-AVX2-NEXT:    [[TMP19:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX2-NEXT:    [[TMP20:%.*]] = or i256 [[TMP13]], [[TMP18]]
+; X64-MIC-AVX2-NEXT:    [[TMP21:%.*]] = or i256 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX2-NEXT:    [[TMP22:%.*]] = icmp ne i256 [[TMP21]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X64-MIC-AVX2-NEXT:    ret i1 [[TMP22]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length127_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 63
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 63
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i512, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = xor i512 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = or i512 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i512 [[TMP9]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP10]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 127) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length127_lt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length127_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length127_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length127_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length127_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length127_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length127_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length127_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length127_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length127_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length127_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 127) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+; memcmp(x, y, 127) > 0: an ordering (not equality-only) compare of 127 bytes.
+; Every check prefix below expects the memcmp libcall to be left unexpanded.
+define i1 @length127_gt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length127_gt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length127_gt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length127_gt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length127_gt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length127_gt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length127_gt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length127_gt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length127_gt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length127_gt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length127_gt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 127) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+; memcmp(X, @.str, 127) == 0: equality-only compare of 127 bytes against a
+; constant string. Expectations per check prefix below:
+;  - X64, X64-SSE41: the memcmp libcall is left unexpanded.
+;  - X64-AVX1, X64-AVX2, X64-AVX512BW-256, X64-AVX512F-256, X64-MIC-AVX2:
+;    four i256 loads from X at offsets 0, 32, 64 and 95 (the last one overlaps
+;    the previous load), each xor'ed with an immediate integer constant.
+;  - X64-AVX512BW, X64-AVX512F, X64-MIC-AVX512F: two i512 loads from X at
+;    offsets 0 and 63, each xor'ed with an i512 load from @.str.
+define i1 @length127_eq_const(ptr %X) nounwind {
+; X64-LABEL: define i1 @length127_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 127) #[[ATTR0]]
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length127_eq_const(
+; X64-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 127) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length127_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP6]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP7]], 24064810364522754539996825585178935186817565138301605567169177049701086016820
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 95
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = xor i256 [[TMP10]], 24518896988982801982081367250212210778372643504230047123819838724519570650677
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = or i256 [[TMP8]], [[TMP11]]
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = or i256 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    [[TMP15:%.*]] = icmp ne i256 [[TMP14]], 0
+; X64-AVX1-NEXT:    [[TMP16:%.*]] = zext i1 [[TMP15]] to i32
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP16]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length127_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP6]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP7]], 24064810364522754539996825585178935186817565138301605567169177049701086016820
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 95
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = xor i256 [[TMP10]], 24518896988982801982081367250212210778372643504230047123819838724519570650677
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = or i256 [[TMP8]], [[TMP11]]
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = or i256 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    [[TMP15:%.*]] = icmp ne i256 [[TMP14]], 0
+; X64-AVX2-NEXT:    [[TMP16:%.*]] = zext i1 [[TMP15]] to i32
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP16]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length127_eq_const(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP6]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP7]], 24064810364522754539996825585178935186817565138301605567169177049701086016820
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 95
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = xor i256 [[TMP10]], 24518896988982801982081367250212210778372643504230047123819838724519570650677
+; X64-AVX512BW-256-NEXT:    [[TMP12:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX512BW-256-NEXT:    [[TMP13:%.*]] = or i256 [[TMP8]], [[TMP11]]
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = or i256 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    [[TMP15:%.*]] = icmp ne i256 [[TMP14]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP16:%.*]] = zext i1 [[TMP15]] to i32
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP16]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length127_eq_const(
+; X64-AVX512BW-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i512, ptr @.str, align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 63
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 63), align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = xor i512 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = or i512 [[TMP3]], [[TMP7]]
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = icmp ne i512 [[TMP8]], 0
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = zext i1 [[TMP9]] to i32
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP10]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length127_eq_const(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP6]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP7]], 24064810364522754539996825585178935186817565138301605567169177049701086016820
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 95
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = xor i256 [[TMP10]], 24518896988982801982081367250212210778372643504230047123819838724519570650677
+; X64-AVX512F-256-NEXT:    [[TMP12:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX512F-256-NEXT:    [[TMP13:%.*]] = or i256 [[TMP8]], [[TMP11]]
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = or i256 [[TMP12]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    [[TMP15:%.*]] = icmp ne i256 [[TMP14]], 0
+; X64-AVX512F-256-NEXT:    [[TMP16:%.*]] = zext i1 [[TMP15]] to i32
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP16]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length127_eq_const(
+; X64-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr @.str, align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 63
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 63), align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = xor i512 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = or i512 [[TMP3]], [[TMP7]]
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = icmp ne i512 [[TMP8]], 0
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = zext i1 [[TMP9]] to i32
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP10]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length127_eq_const(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP6]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP7]], 24064810364522754539996825585178935186817565138301605567169177049701086016820
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 95
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = xor i256 [[TMP10]], 24518896988982801982081367250212210778372643504230047123819838724519570650677
+; X64-MIC-AVX2-NEXT:    [[TMP12:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-MIC-AVX2-NEXT:    [[TMP13:%.*]] = or i256 [[TMP8]], [[TMP11]]
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = or i256 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    [[TMP15:%.*]] = icmp ne i256 [[TMP14]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP16:%.*]] = zext i1 [[TMP15]] to i32
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP16]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length127_eq_const(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr @.str, align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 63
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 63), align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = xor i512 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = or i512 [[TMP3]], [[TMP7]]
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = icmp ne i512 [[TMP8]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = zext i1 [[TMP9]] to i32
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP10]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 127) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; Returns the raw three-way result of memcmp(X, Y, 128).
+; Every check prefix below expects the memcmp libcall to be left unexpanded.
+define i32 @length128(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length128(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR0]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-SSE41-LABEL: define i32 @length128(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-SSE41-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @length128(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @length128(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length128(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-LABEL: define i32 @length128(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length128(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-LABEL: define i32 @length128(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-AVX512F-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length128(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length128(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[M]]
+;
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 128) nounwind
+  ret i32 %m
+}
+
+; Zero-test of memcmp(x, y, 128). Note that despite the "_eq" name the input
+; IR compares with `icmp ne`, so the function returns true when the buffers
+; differ. Expectations per check prefix below:
+;  - X64, X64-SSE41: the memcmp libcall is left unexpanded.
+;  - X64-AVX1, X64-AVX2, X64-AVX512BW-256, X64-AVX512F-256, X64-MIC-AVX2:
+;    four pairs of i256 loads at offsets 0, 32, 64 and 96, xor'ed pairwise and
+;    or-reduced before a single compare against zero.
+;  - X64-AVX512BW, X64-AVX512F, X64-MIC-AVX512F: two pairs of i512 loads at
+;    offsets 0 and 64, xor'ed pairwise and or-reduced the same way.
+define i1 @length128_eq(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length128_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length128_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length128_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = load i256, ptr [[TMP10]], align 1
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = xor i256 [[TMP11]], [[TMP12]]
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 96
+; X64-AVX1-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 96
+; X64-AVX1-NEXT:    [[TMP16:%.*]] = load i256, ptr [[TMP14]], align 1
+; X64-AVX1-NEXT:    [[TMP17:%.*]] = load i256, ptr [[TMP15]], align 1
+; X64-AVX1-NEXT:    [[TMP18:%.*]] = xor i256 [[TMP16]], [[TMP17]]
+; X64-AVX1-NEXT:    [[TMP19:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX1-NEXT:    [[TMP20:%.*]] = or i256 [[TMP13]], [[TMP18]]
+; X64-AVX1-NEXT:    [[TMP21:%.*]] = or i256 [[TMP19]], [[TMP20]]
+; X64-AVX1-NEXT:    [[TMP22:%.*]] = icmp ne i256 [[TMP21]], 0
+; X64-AVX1-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP22]]
+;
+; X64-AVX2-LABEL: define i1 @length128_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = load i256, ptr [[TMP10]], align 1
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = xor i256 [[TMP11]], [[TMP12]]
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 96
+; X64-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 96
+; X64-AVX2-NEXT:    [[TMP16:%.*]] = load i256, ptr [[TMP14]], align 1
+; X64-AVX2-NEXT:    [[TMP17:%.*]] = load i256, ptr [[TMP15]], align 1
+; X64-AVX2-NEXT:    [[TMP18:%.*]] = xor i256 [[TMP16]], [[TMP17]]
+; X64-AVX2-NEXT:    [[TMP19:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX2-NEXT:    [[TMP20:%.*]] = or i256 [[TMP13]], [[TMP18]]
+; X64-AVX2-NEXT:    [[TMP21:%.*]] = or i256 [[TMP19]], [[TMP20]]
+; X64-AVX2-NEXT:    [[TMP22:%.*]] = icmp ne i256 [[TMP21]], 0
+; X64-AVX2-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP22]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length128_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12:%.*]] = load i256, ptr [[TMP10]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP13:%.*]] = xor i256 [[TMP11]], [[TMP12]]
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 96
+; X64-AVX512BW-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 96
+; X64-AVX512BW-256-NEXT:    [[TMP16:%.*]] = load i256, ptr [[TMP14]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP17:%.*]] = load i256, ptr [[TMP15]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP18:%.*]] = xor i256 [[TMP16]], [[TMP17]]
+; X64-AVX512BW-256-NEXT:    [[TMP19:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-256-NEXT:    [[TMP20:%.*]] = or i256 [[TMP13]], [[TMP18]]
+; X64-AVX512BW-256-NEXT:    [[TMP21:%.*]] = or i256 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-256-NEXT:    [[TMP22:%.*]] = icmp ne i256 [[TMP21]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X64-AVX512BW-256-NEXT:    ret i1 [[TMP22]]
+;
+; X64-AVX512BW-LABEL: define i1 @length128_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i512, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i512, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = xor i512 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = or i512 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = icmp ne i512 [[TMP9]], 0
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512BW-NEXT:    ret i1 [[TMP10]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length128_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12:%.*]] = load i256, ptr [[TMP10]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP13:%.*]] = xor i256 [[TMP11]], [[TMP12]]
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 96
+; X64-AVX512F-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 96
+; X64-AVX512F-256-NEXT:    [[TMP16:%.*]] = load i256, ptr [[TMP14]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP17:%.*]] = load i256, ptr [[TMP15]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP18:%.*]] = xor i256 [[TMP16]], [[TMP17]]
+; X64-AVX512F-256-NEXT:    [[TMP19:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX512F-256-NEXT:    [[TMP20:%.*]] = or i256 [[TMP13]], [[TMP18]]
+; X64-AVX512F-256-NEXT:    [[TMP21:%.*]] = or i256 [[TMP19]], [[TMP20]]
+; X64-AVX512F-256-NEXT:    [[TMP22:%.*]] = icmp ne i256 [[TMP21]], 0
+; X64-AVX512F-256-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X64-AVX512F-256-NEXT:    ret i1 [[TMP22]]
+;
+; X64-AVX512F-LABEL: define i1 @length128_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i512, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = xor i512 [[TMP6]], [[TMP7]]
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = or i512 [[TMP3]], [[TMP8]]
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i512 [[TMP9]], 0
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512F-NEXT:    ret i1 [[TMP10]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length128_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12:%.*]] = load i256, ptr [[TMP10]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP13:%.*]] = xor i256 [[TMP11]], [[TMP12]]
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 96
+; X64-MIC-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 96
+; X64-MIC-AVX2-NEXT:    [[TMP16:%.*]] = load i256, ptr [[TMP14]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP17:%.*]] = load i256, ptr [[TMP15]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP18:%.*]] = xor i256 [[TMP16]], [[TMP17]]
+; X64-MIC-AVX2-NEXT:    [[TMP19:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX2-NEXT:    [[TMP20:%.*]] = or i256 [[TMP13]], [[TMP18]]
+; X64-MIC-AVX2-NEXT:    [[TMP21:%.*]] = or i256 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX2-NEXT:    [[TMP22:%.*]] = icmp ne i256 [[TMP21]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X64-MIC-AVX2-NEXT:    ret i1 [[TMP22]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length128_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i512, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = xor i512 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = or i512 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i512 [[TMP9]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP10]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 128) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+; 128-byte memcmp whose result is only tested for "less than zero" (icmp slt).
+; Every check prefix below expects the call to stay a plain memcmp call
+; followed by the original signed compare, i.e. no inline expansion.
+define i1 @length128_lt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length128_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length128_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length128_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length128_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length128_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length128_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length128_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length128_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length128_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length128_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 128) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+; 128-byte memcmp whose result is only tested for "greater than zero"
+; (icmp sgt). Every check prefix below expects the call to stay a plain
+; memcmp call followed by the original signed compare, i.e. no inline expansion.
+define i1 @length128_gt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length128_gt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length128_gt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length128_gt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length128_gt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length128_gt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length128_gt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length128_gt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length128_gt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length128_gt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length128_gt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 128) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+; Equality test (icmp eq ..., 0) of 128 bytes at %X against the global @.str.
+; The IR-style expectations below fall into three groups:
+;  - the base and SSE4.1 prefixes keep the memcmp call untouched;
+;  - the AVX1, AVX2, MIC-AVX2 and the two "-256" prefixes expect four i256
+;    loads from %X (offsets 0, 32, 64, 96), each xor'ed with an immediate
+;    constant, or-reduced and compared against 0;
+;  - the AVX512BW, AVX512F and MIC-AVX512F prefixes (without "-256") expect
+;    two i512 loads from %X paired with i512 loads from @.str (offsets 0 and
+;    64), xor'ed, or'ed and compared against 0.
+; NOTE(review): the first check block under the define uses assembly syntax
+; (pushq/callq/retq) and an asm-style label, unlike every IR-style block that
+; follows it; it looks like a leftover from an llc-based version of this test.
+; Confirm against the RUN lines before relying on it or removing it.
+define i1 @length128_eq_const(ptr %X) nounwind {
+; X64-SSE-LABEL: length128_eq_const:
+; X64-SSE:       # %bb.0:
+; X64-SSE-NEXT:    pushq %rax
+; X64-SSE-NEXT:    movl $.L.str, %esi
+; X64-SSE-NEXT:    movl $128, %edx
+; X64-SSE-NEXT:    callq memcmp
+; X64-SSE-NEXT:    testl %eax, %eax
+; X64-SSE-NEXT:    sete %al
+; X64-SSE-NEXT:    popq %rcx
+; X64-SSE-NEXT:    retq
+;
+;
+; X64-LABEL: define i1 @length128_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 128) #[[ATTR0]]
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length128_eq_const(
+; X64-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 128) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length128_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP6]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP7]], 24064810364522754539996825585178935186817565138301605567169177049701086016820
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 96
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = xor i256 [[TMP10]], 24972983613442865430775334151281434151203991406697113551929636559217741018934
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = or i256 [[TMP8]], [[TMP11]]
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = or i256 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    [[TMP15:%.*]] = icmp ne i256 [[TMP14]], 0
+; X64-AVX1-NEXT:    [[TMP16:%.*]] = zext i1 [[TMP15]] to i32
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP16]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length128_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP6]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP7]], 24064810364522754539996825585178935186817565138301605567169177049701086016820
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 96
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = xor i256 [[TMP10]], 24972983613442865430775334151281434151203991406697113551929636559217741018934
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = or i256 [[TMP8]], [[TMP11]]
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = or i256 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    [[TMP15:%.*]] = icmp ne i256 [[TMP14]], 0
+; X64-AVX2-NEXT:    [[TMP16:%.*]] = zext i1 [[TMP15]] to i32
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP16]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length128_eq_const(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP6]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP7]], 24064810364522754539996825585178935186817565138301605567169177049701086016820
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 96
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = xor i256 [[TMP10]], 24972983613442865430775334151281434151203991406697113551929636559217741018934
+; X64-AVX512BW-256-NEXT:    [[TMP12:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX512BW-256-NEXT:    [[TMP13:%.*]] = or i256 [[TMP8]], [[TMP11]]
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = or i256 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    [[TMP15:%.*]] = icmp ne i256 [[TMP14]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP16:%.*]] = zext i1 [[TMP15]] to i32
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP16]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length128_eq_const(
+; X64-AVX512BW-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i512, ptr @.str, align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 64), align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = xor i512 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = or i512 [[TMP3]], [[TMP7]]
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = icmp ne i512 [[TMP8]], 0
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = zext i1 [[TMP9]] to i32
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP10]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length128_eq_const(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP6]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP7]], 24064810364522754539996825585178935186817565138301605567169177049701086016820
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 96
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = xor i256 [[TMP10]], 24972983613442865430775334151281434151203991406697113551929636559217741018934
+; X64-AVX512F-256-NEXT:    [[TMP12:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX512F-256-NEXT:    [[TMP13:%.*]] = or i256 [[TMP8]], [[TMP11]]
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = or i256 [[TMP12]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    [[TMP15:%.*]] = icmp ne i256 [[TMP14]], 0
+; X64-AVX512F-256-NEXT:    [[TMP16:%.*]] = zext i1 [[TMP15]] to i32
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP16]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length128_eq_const(
+; X64-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr @.str, align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 64), align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = xor i512 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = or i512 [[TMP3]], [[TMP7]]
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = icmp ne i512 [[TMP8]], 0
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = zext i1 [[TMP9]] to i32
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP10]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length128_eq_const(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP6]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP7]], 24064810364522754539996825585178935186817565138301605567169177049701086016820
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 96
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = xor i256 [[TMP10]], 24972983613442865430775334151281434151203991406697113551929636559217741018934
+; X64-MIC-AVX2-NEXT:    [[TMP12:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-MIC-AVX2-NEXT:    [[TMP13:%.*]] = or i256 [[TMP8]], [[TMP11]]
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = or i256 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    [[TMP15:%.*]] = icmp ne i256 [[TMP14]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP16:%.*]] = zext i1 [[TMP15]] to i32
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP16]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length128_eq_const(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr @.str, align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 64), align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = xor i512 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = or i512 [[TMP3]], [[TMP7]]
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = icmp ne i512 [[TMP8]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = zext i1 [[TMP9]] to i32
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP10]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 128) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; Returns the raw i32 result of a 192-byte memcmp (three-way comparison).
+; Every check prefix below expects the call to be kept exactly as written,
+; i.e. no inline expansion.
+define i32 @length192(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length192(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR0]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-SSE41-LABEL: define i32 @length192(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-SSE41-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @length192(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @length192(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length192(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-LABEL: define i32 @length192(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length192(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-LABEL: define i32 @length192(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX512F-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length192(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length192(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[M]]
+;
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 192) nounwind
+  ret i32 %m
+}
+
+; 192-byte memcmp compared against zero. Note that, despite the _eq suffix in
+; the function name, the IR body uses `icmp ne`.
+; The AVX512BW, AVX512F and MIC-AVX512F prefixes (not their "-256" variants)
+; expect three pairs of i512 loads at offsets 0, 64 and 128, each pair xor'ed,
+; the results or'ed together, and the final `icmp ne ..., 0` returned
+; directly; the zext that follows that compare is matched but its value is
+; not used by the ret. All other prefixes expect the memcmp call to remain.
+define i1 @length192_eq(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length192_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length192_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length192_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length192_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length192_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length192_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i512, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i512, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = xor i512 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 128
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 128
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i512, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12:%.*]] = load i512, ptr [[TMP10]], align 1
+; X64-AVX512BW-NEXT:    [[TMP13:%.*]] = xor i512 [[TMP11]], [[TMP12]]
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = or i512 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-NEXT:    [[TMP15:%.*]] = or i512 [[TMP14]], [[TMP13]]
+; X64-AVX512BW-NEXT:    [[TMP16:%.*]] = icmp ne i512 [[TMP15]], 0
+; X64-AVX512BW-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X64-AVX512BW-NEXT:    ret i1 [[TMP16]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length192_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length192_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i512, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = xor i512 [[TMP6]], [[TMP7]]
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 128
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 128
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i512, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12:%.*]] = load i512, ptr [[TMP10]], align 1
+; X64-AVX512F-NEXT:    [[TMP13:%.*]] = xor i512 [[TMP11]], [[TMP12]]
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = or i512 [[TMP3]], [[TMP8]]
+; X64-AVX512F-NEXT:    [[TMP15:%.*]] = or i512 [[TMP14]], [[TMP13]]
+; X64-AVX512F-NEXT:    [[TMP16:%.*]] = icmp ne i512 [[TMP15]], 0
+; X64-AVX512F-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X64-AVX512F-NEXT:    ret i1 [[TMP16]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length192_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length192_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i512, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = xor i512 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 128
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 128
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i512, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12:%.*]] = load i512, ptr [[TMP10]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP13:%.*]] = xor i512 [[TMP11]], [[TMP12]]
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = or i512 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX512F-NEXT:    [[TMP15:%.*]] = or i512 [[TMP14]], [[TMP13]]
+; X64-MIC-AVX512F-NEXT:    [[TMP16:%.*]] = icmp ne i512 [[TMP15]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP16]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 192) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+; 192-byte memcmp whose result is only tested for "less than zero" (icmp slt).
+; Every check prefix below expects the call to stay a plain memcmp call
+; followed by the original signed compare, i.e. no inline expansion.
+define i1 @length192_lt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length192_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length192_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length192_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length192_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length192_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length192_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length192_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length192_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length192_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length192_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 192) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+; 192-byte memcmp whose result is only tested for "greater than zero"
+; (icmp sgt). Every check prefix below expects the call to stay a plain
+; memcmp call followed by the original signed compare, i.e. no inline expansion.
+define i1 @length192_gt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length192_gt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length192_gt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length192_gt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length192_gt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length192_gt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length192_gt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length192_gt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length192_gt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length192_gt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length192_gt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 192) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length192_eq_const(ptr %X) nounwind {
+; X64-LABEL: define i1 @length192_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 192) #[[ATTR0]]
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length192_eq_const(
+; X64-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 192) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length192_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 192) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length192_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 192) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length192_eq_const(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 192) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length192_eq_const(
+; X64-AVX512BW-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i512, ptr @.str, align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 64), align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = xor i512 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 128
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = load i512, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 128), align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = xor i512 [[TMP9]], [[TMP10]]
+; X64-AVX512BW-NEXT:    [[TMP12:%.*]] = or i512 [[TMP3]], [[TMP7]]
+; X64-AVX512BW-NEXT:    [[TMP13:%.*]] = or i512 [[TMP12]], [[TMP11]]
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = icmp ne i512 [[TMP13]], 0
+; X64-AVX512BW-NEXT:    [[TMP15:%.*]] = zext i1 [[TMP14]] to i32
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP15]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length192_eq_const(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 192) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length192_eq_const(
+; X64-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr @.str, align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 64), align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = xor i512 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 128
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = load i512, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 128), align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = xor i512 [[TMP9]], [[TMP10]]
+; X64-AVX512F-NEXT:    [[TMP12:%.*]] = or i512 [[TMP3]], [[TMP7]]
+; X64-AVX512F-NEXT:    [[TMP13:%.*]] = or i512 [[TMP12]], [[TMP11]]
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = icmp ne i512 [[TMP13]], 0
+; X64-AVX512F-NEXT:    [[TMP15:%.*]] = zext i1 [[TMP14]] to i32
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP15]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length192_eq_const(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 192) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length192_eq_const(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr @.str, align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 64), align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = xor i512 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 128
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = load i512, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 128), align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = xor i512 [[TMP9]], [[TMP10]]
+; X64-MIC-AVX512F-NEXT:    [[TMP12:%.*]] = or i512 [[TMP3]], [[TMP7]]
+; X64-MIC-AVX512F-NEXT:    [[TMP13:%.*]] = or i512 [[TMP12]], [[TMP11]]
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = icmp ne i512 [[TMP13]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP15:%.*]] = zext i1 [[TMP14]] to i32
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP15]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 192) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length255(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length255(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR0]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-SSE41-LABEL: define i32 @length255(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-SSE41-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @length255(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @length255(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length255(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-LABEL: define i32 @length255(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length255(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-LABEL: define i32 @length255(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX512F-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length255(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length255(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[M]]
+;
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 255) nounwind
+  ret i32 %m
+}
+
+define i1 @length255_eq(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length255_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length255_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length255_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length255_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length255_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length255_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i512, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i512, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = xor i512 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 128
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 128
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i512, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12:%.*]] = load i512, ptr [[TMP10]], align 1
+; X64-AVX512BW-NEXT:    [[TMP13:%.*]] = xor i512 [[TMP11]], [[TMP12]]
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 191
+; X64-AVX512BW-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 191
+; X64-AVX512BW-NEXT:    [[TMP16:%.*]] = load i512, ptr [[TMP14]], align 1
+; X64-AVX512BW-NEXT:    [[TMP17:%.*]] = load i512, ptr [[TMP15]], align 1
+; X64-AVX512BW-NEXT:    [[TMP18:%.*]] = xor i512 [[TMP16]], [[TMP17]]
+; X64-AVX512BW-NEXT:    [[TMP19:%.*]] = or i512 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-NEXT:    [[TMP20:%.*]] = or i512 [[TMP13]], [[TMP18]]
+; X64-AVX512BW-NEXT:    [[TMP21:%.*]] = or i512 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-NEXT:    [[TMP22:%.*]] = icmp ne i512 [[TMP21]], 0
+; X64-AVX512BW-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X64-AVX512BW-NEXT:    ret i1 [[TMP22]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length255_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length255_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i512, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = xor i512 [[TMP6]], [[TMP7]]
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 128
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 128
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i512, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12:%.*]] = load i512, ptr [[TMP10]], align 1
+; X64-AVX512F-NEXT:    [[TMP13:%.*]] = xor i512 [[TMP11]], [[TMP12]]
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 191
+; X64-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 191
+; X64-AVX512F-NEXT:    [[TMP16:%.*]] = load i512, ptr [[TMP14]], align 1
+; X64-AVX512F-NEXT:    [[TMP17:%.*]] = load i512, ptr [[TMP15]], align 1
+; X64-AVX512F-NEXT:    [[TMP18:%.*]] = xor i512 [[TMP16]], [[TMP17]]
+; X64-AVX512F-NEXT:    [[TMP19:%.*]] = or i512 [[TMP3]], [[TMP8]]
+; X64-AVX512F-NEXT:    [[TMP20:%.*]] = or i512 [[TMP13]], [[TMP18]]
+; X64-AVX512F-NEXT:    [[TMP21:%.*]] = or i512 [[TMP19]], [[TMP20]]
+; X64-AVX512F-NEXT:    [[TMP22:%.*]] = icmp ne i512 [[TMP21]], 0
+; X64-AVX512F-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X64-AVX512F-NEXT:    ret i1 [[TMP22]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length255_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length255_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i512, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = xor i512 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 128
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 128
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i512, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12:%.*]] = load i512, ptr [[TMP10]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP13:%.*]] = xor i512 [[TMP11]], [[TMP12]]
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 191
+; X64-MIC-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 191
+; X64-MIC-AVX512F-NEXT:    [[TMP16:%.*]] = load i512, ptr [[TMP14]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP17:%.*]] = load i512, ptr [[TMP15]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP18:%.*]] = xor i512 [[TMP16]], [[TMP17]]
+; X64-MIC-AVX512F-NEXT:    [[TMP19:%.*]] = or i512 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX512F-NEXT:    [[TMP20:%.*]] = or i512 [[TMP13]], [[TMP18]]
+; X64-MIC-AVX512F-NEXT:    [[TMP21:%.*]] = or i512 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX512F-NEXT:    [[TMP22:%.*]] = icmp ne i512 [[TMP21]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP22]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 255) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length255_lt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length255_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length255_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length255_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length255_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length255_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length255_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length255_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length255_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length255_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length255_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 255) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length255_gt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length255_gt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length255_gt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length255_gt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length255_gt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length255_gt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length255_gt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length255_gt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length255_gt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length255_gt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length255_gt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 255) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length255_eq_const(ptr %X) nounwind {
+; X64-LABEL: define i1 @length255_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 255) #[[ATTR0]]
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length255_eq_const(
+; X64-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 255) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length255_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 255) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length255_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 255) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length255_eq_const(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 255) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length255_eq_const(
+; X64-AVX512BW-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i512, ptr @.str, align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 64), align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = xor i512 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 128
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = load i512, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 128), align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = xor i512 [[TMP9]], [[TMP10]]
+; X64-AVX512BW-NEXT:    [[TMP12:%.*]] = getelementptr i8, ptr [[X]], i64 191
+; X64-AVX512BW-NEXT:    [[TMP13:%.*]] = load i512, ptr [[TMP12]], align 1
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 191), align 1
+; X64-AVX512BW-NEXT:    [[TMP15:%.*]] = xor i512 [[TMP13]], [[TMP14]]
+; X64-AVX512BW-NEXT:    [[TMP16:%.*]] = or i512 [[TMP3]], [[TMP7]]
+; X64-AVX512BW-NEXT:    [[TMP17:%.*]] = or i512 [[TMP11]], [[TMP15]]
+; X64-AVX512BW-NEXT:    [[TMP18:%.*]] = or i512 [[TMP16]], [[TMP17]]
+; X64-AVX512BW-NEXT:    [[TMP19:%.*]] = icmp ne i512 [[TMP18]], 0
+; X64-AVX512BW-NEXT:    [[TMP20:%.*]] = zext i1 [[TMP19]] to i32
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP20]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length255_eq_const(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 255) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length255_eq_const(
+; X64-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr @.str, align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 64), align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = xor i512 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 128
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = load i512, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 128), align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = xor i512 [[TMP9]], [[TMP10]]
+; X64-AVX512F-NEXT:    [[TMP12:%.*]] = getelementptr i8, ptr [[X]], i64 191
+; X64-AVX512F-NEXT:    [[TMP13:%.*]] = load i512, ptr [[TMP12]], align 1
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 191), align 1
+; X64-AVX512F-NEXT:    [[TMP15:%.*]] = xor i512 [[TMP13]], [[TMP14]]
+; X64-AVX512F-NEXT:    [[TMP16:%.*]] = or i512 [[TMP3]], [[TMP7]]
+; X64-AVX512F-NEXT:    [[TMP17:%.*]] = or i512 [[TMP11]], [[TMP15]]
+; X64-AVX512F-NEXT:    [[TMP18:%.*]] = or i512 [[TMP16]], [[TMP17]]
+; X64-AVX512F-NEXT:    [[TMP19:%.*]] = icmp ne i512 [[TMP18]], 0
+; X64-AVX512F-NEXT:    [[TMP20:%.*]] = zext i1 [[TMP19]] to i32
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP20]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length255_eq_const(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 255) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length255_eq_const(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr @.str, align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 64), align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = xor i512 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 128
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = load i512, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 128), align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = xor i512 [[TMP9]], [[TMP10]]
+; X64-MIC-AVX512F-NEXT:    [[TMP12:%.*]] = getelementptr i8, ptr [[X]], i64 191
+; X64-MIC-AVX512F-NEXT:    [[TMP13:%.*]] = load i512, ptr [[TMP12]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 191), align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP15:%.*]] = xor i512 [[TMP13]], [[TMP14]]
+; X64-MIC-AVX512F-NEXT:    [[TMP16:%.*]] = or i512 [[TMP3]], [[TMP7]]
+; X64-MIC-AVX512F-NEXT:    [[TMP17:%.*]] = or i512 [[TMP11]], [[TMP15]]
+; X64-MIC-AVX512F-NEXT:    [[TMP18:%.*]] = or i512 [[TMP16]], [[TMP17]]
+; X64-MIC-AVX512F-NEXT:    [[TMP19:%.*]] = icmp ne i512 [[TMP18]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP20:%.*]] = zext i1 [[TMP19]] to i32
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP20]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 255) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length256(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length256(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR0]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-SSE41-LABEL: define i32 @length256(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-SSE41-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @length256(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @length256(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length256(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-LABEL: define i32 @length256(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length256(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-LABEL: define i32 @length256(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX512F-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length256(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length256(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[M]]
+;
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 256) nounwind
+  ret i32 %m
+}
+
+define i1 @length256_eq(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length256_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length256_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length256_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length256_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length256_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length256_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i512, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i512, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = xor i512 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 128
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 128
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i512, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12:%.*]] = load i512, ptr [[TMP10]], align 1
+; X64-AVX512BW-NEXT:    [[TMP13:%.*]] = xor i512 [[TMP11]], [[TMP12]]
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 192
+; X64-AVX512BW-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 192
+; X64-AVX512BW-NEXT:    [[TMP16:%.*]] = load i512, ptr [[TMP14]], align 1
+; X64-AVX512BW-NEXT:    [[TMP17:%.*]] = load i512, ptr [[TMP15]], align 1
+; X64-AVX512BW-NEXT:    [[TMP18:%.*]] = xor i512 [[TMP16]], [[TMP17]]
+; X64-AVX512BW-NEXT:    [[TMP19:%.*]] = or i512 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-NEXT:    [[TMP20:%.*]] = or i512 [[TMP13]], [[TMP18]]
+; X64-AVX512BW-NEXT:    [[TMP21:%.*]] = or i512 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-NEXT:    [[TMP22:%.*]] = icmp ne i512 [[TMP21]], 0
+; X64-AVX512BW-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X64-AVX512BW-NEXT:    ret i1 [[TMP22]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length256_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length256_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i512, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = xor i512 [[TMP6]], [[TMP7]]
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 128
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 128
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i512, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12:%.*]] = load i512, ptr [[TMP10]], align 1
+; X64-AVX512F-NEXT:    [[TMP13:%.*]] = xor i512 [[TMP11]], [[TMP12]]
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 192
+; X64-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 192
+; X64-AVX512F-NEXT:    [[TMP16:%.*]] = load i512, ptr [[TMP14]], align 1
+; X64-AVX512F-NEXT:    [[TMP17:%.*]] = load i512, ptr [[TMP15]], align 1
+; X64-AVX512F-NEXT:    [[TMP18:%.*]] = xor i512 [[TMP16]], [[TMP17]]
+; X64-AVX512F-NEXT:    [[TMP19:%.*]] = or i512 [[TMP3]], [[TMP8]]
+; X64-AVX512F-NEXT:    [[TMP20:%.*]] = or i512 [[TMP13]], [[TMP18]]
+; X64-AVX512F-NEXT:    [[TMP21:%.*]] = or i512 [[TMP19]], [[TMP20]]
+; X64-AVX512F-NEXT:    [[TMP22:%.*]] = icmp ne i512 [[TMP21]], 0
+; X64-AVX512F-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X64-AVX512F-NEXT:    ret i1 [[TMP22]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length256_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length256_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i512, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = xor i512 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 128
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 128
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i512, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12:%.*]] = load i512, ptr [[TMP10]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP13:%.*]] = xor i512 [[TMP11]], [[TMP12]]
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 192
+; X64-MIC-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 192
+; X64-MIC-AVX512F-NEXT:    [[TMP16:%.*]] = load i512, ptr [[TMP14]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP17:%.*]] = load i512, ptr [[TMP15]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP18:%.*]] = xor i512 [[TMP16]], [[TMP17]]
+; X64-MIC-AVX512F-NEXT:    [[TMP19:%.*]] = or i512 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX512F-NEXT:    [[TMP20:%.*]] = or i512 [[TMP13]], [[TMP18]]
+; X64-MIC-AVX512F-NEXT:    [[TMP21:%.*]] = or i512 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX512F-NEXT:    [[TMP22:%.*]] = icmp ne i512 [[TMP21]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP22]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 256) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length256_lt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length256_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length256_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length256_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length256_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length256_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length256_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length256_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length256_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length256_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length256_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 256) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length256_gt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length256_gt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length256_gt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length256_gt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length256_gt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length256_gt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length256_gt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length256_gt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length256_gt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length256_gt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length256_gt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 256) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length256_eq_const(ptr %X) nounwind {
+; X64-LABEL: define i1 @length256_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 256) #[[ATTR0]]
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length256_eq_const(
+; X64-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 256) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length256_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 256) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length256_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 256) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length256_eq_const(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 256) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length256_eq_const(
+; X64-AVX512BW-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i512, ptr @.str, align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 64), align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = xor i512 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 128
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = load i512, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 128), align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = xor i512 [[TMP9]], [[TMP10]]
+; X64-AVX512BW-NEXT:    [[TMP12:%.*]] = getelementptr i8, ptr [[X]], i64 192
+; X64-AVX512BW-NEXT:    [[TMP13:%.*]] = load i512, ptr [[TMP12]], align 1
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 192), align 1
+; X64-AVX512BW-NEXT:    [[TMP15:%.*]] = xor i512 [[TMP13]], [[TMP14]]
+; X64-AVX512BW-NEXT:    [[TMP16:%.*]] = or i512 [[TMP3]], [[TMP7]]
+; X64-AVX512BW-NEXT:    [[TMP17:%.*]] = or i512 [[TMP11]], [[TMP15]]
+; X64-AVX512BW-NEXT:    [[TMP18:%.*]] = or i512 [[TMP16]], [[TMP17]]
+; X64-AVX512BW-NEXT:    [[TMP19:%.*]] = icmp ne i512 [[TMP18]], 0
+; X64-AVX512BW-NEXT:    [[TMP20:%.*]] = zext i1 [[TMP19]] to i32
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP20]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length256_eq_const(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 256) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length256_eq_const(
+; X64-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr @.str, align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 64), align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = xor i512 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 128
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = load i512, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 128), align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = xor i512 [[TMP9]], [[TMP10]]
+; X64-AVX512F-NEXT:    [[TMP12:%.*]] = getelementptr i8, ptr [[X]], i64 192
+; X64-AVX512F-NEXT:    [[TMP13:%.*]] = load i512, ptr [[TMP12]], align 1
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 192), align 1
+; X64-AVX512F-NEXT:    [[TMP15:%.*]] = xor i512 [[TMP13]], [[TMP14]]
+; X64-AVX512F-NEXT:    [[TMP16:%.*]] = or i512 [[TMP3]], [[TMP7]]
+; X64-AVX512F-NEXT:    [[TMP17:%.*]] = or i512 [[TMP11]], [[TMP15]]
+; X64-AVX512F-NEXT:    [[TMP18:%.*]] = or i512 [[TMP16]], [[TMP17]]
+; X64-AVX512F-NEXT:    [[TMP19:%.*]] = icmp ne i512 [[TMP18]], 0
+; X64-AVX512F-NEXT:    [[TMP20:%.*]] = zext i1 [[TMP19]] to i32
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP20]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length256_eq_const(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 256) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length256_eq_const(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr @.str, align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 64), align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = xor i512 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 128
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = load i512, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 128), align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = xor i512 [[TMP9]], [[TMP10]]
+; X64-MIC-AVX512F-NEXT:    [[TMP12:%.*]] = getelementptr i8, ptr [[X]], i64 192
+; X64-MIC-AVX512F-NEXT:    [[TMP13:%.*]] = load i512, ptr [[TMP12]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 192), align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP15:%.*]] = xor i512 [[TMP13]], [[TMP14]]
+; X64-MIC-AVX512F-NEXT:    [[TMP16:%.*]] = or i512 [[TMP3]], [[TMP7]]
+; X64-MIC-AVX512F-NEXT:    [[TMP17:%.*]] = or i512 [[TMP11]], [[TMP15]]
+; X64-MIC-AVX512F-NEXT:    [[TMP18:%.*]] = or i512 [[TMP16]], [[TMP17]]
+; X64-MIC-AVX512F-NEXT:    [[TMP19:%.*]] = icmp ne i512 [[TMP18]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP20:%.*]] = zext i1 [[TMP19]] to i32
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP20]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 256) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length384(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length384(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR0]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-SSE41-LABEL: define i32 @length384(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-SSE41-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @length384(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @length384(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length384(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-LABEL: define i32 @length384(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length384(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-LABEL: define i32 @length384(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX512F-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length384(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length384(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[M]]
+;
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 384) nounwind
+  ret i32 %m
+}
+
+define i1 @length384_eq(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length384_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length384_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length384_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length384_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length384_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length384_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length384_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length384_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length384_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length384_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 384) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length384_lt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length384_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length384_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length384_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length384_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length384_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length384_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length384_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length384_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length384_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length384_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 384) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length384_gt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length384_gt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length384_gt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length384_gt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length384_gt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length384_gt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length384_gt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length384_gt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length384_gt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length384_gt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length384_gt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 384) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length384_eq_const(ptr %X) nounwind {
+; X64-LABEL: define i1 @length384_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 384) #[[ATTR0]]
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length384_eq_const(
+; X64-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 384) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length384_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 384) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length384_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 384) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length384_eq_const(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 384) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length384_eq_const(
+; X64-AVX512BW-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 384) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length384_eq_const(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 384) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length384_eq_const(
+; X64-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 384) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length384_eq_const(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 384) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length384_eq_const(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 384) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 384) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length511(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length511(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR0]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-SSE41-LABEL: define i32 @length511(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-SSE41-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @length511(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @length511(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length511(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-LABEL: define i32 @length511(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length511(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-LABEL: define i32 @length511(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX512F-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length511(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length511(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[M]]
+;
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 511) nounwind
+  ret i32 %m
+}
+
+define i1 @length511_eq(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length511_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length511_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length511_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length511_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length511_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length511_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length511_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length511_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length511_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length511_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 511) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length511_lt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length511_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length511_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length511_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length511_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length511_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length511_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length511_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length511_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length511_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length511_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 511) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length511_gt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length511_gt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length511_gt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length511_gt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length511_gt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length511_gt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length511_gt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length511_gt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length511_gt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length511_gt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length511_gt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 511) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length511_eq_const(ptr %X) nounwind {
+; X64-LABEL: define i1 @length511_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 511) #[[ATTR0]]
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length511_eq_const(
+; X64-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 511) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length511_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 511) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length511_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 511) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length511_eq_const(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 511) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length511_eq_const(
+; X64-AVX512BW-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 511) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length511_eq_const(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 511) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length511_eq_const(
+; X64-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 511) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length511_eq_const(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 511) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length511_eq_const(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 511) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 511) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length512(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length512(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR0]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-SSE41-LABEL: define i32 @length512(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-SSE41-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @length512(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @length512(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length512(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-LABEL: define i32 @length512(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length512(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-LABEL: define i32 @length512(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX512F-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length512(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length512(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[M]]
+;
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 512) nounwind
+  ret i32 %m
+}
+
+define i1 @length512_eq(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length512_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length512_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length512_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length512_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length512_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length512_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length512_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length512_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length512_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length512_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 512) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length512_lt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length512_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length512_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length512_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length512_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length512_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length512_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length512_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length512_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length512_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length512_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 512) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length512_gt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length512_gt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length512_gt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length512_gt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length512_gt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length512_gt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length512_gt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length512_gt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length512_gt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length512_gt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length512_gt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 512) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length512_eq_const(ptr %X) nounwind {
+; X64-LABEL: define i1 @length512_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 512) #[[ATTR0]]
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length512_eq_const(
+; X64-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 512) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length512_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 512) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length512_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 512) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length512_eq_const(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 512) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length512_eq_const(
+; X64-AVX512BW-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 512) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length512_eq_const(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 512) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length512_eq_const(
+; X64-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 512) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length512_eq_const(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 512) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length512_eq_const(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 512) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 512) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; This checks that we do not do stupid things with huge sizes.
+define i32 @huge_length(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @huge_length(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 9223372036854775807) #[[ATTR0]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-SSE41-LABEL: define i32 @huge_length(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 9223372036854775807) #[[ATTR5]]
+; X64-SSE41-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @huge_length(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 9223372036854775807) #[[ATTR5]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @huge_length(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 9223372036854775807) #[[ATTR5]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @huge_length(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 9223372036854775807) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-LABEL: define i32 @huge_length(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 9223372036854775807) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-256-LABEL: define i32 @huge_length(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 9223372036854775807) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-LABEL: define i32 @huge_length(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 9223372036854775807) #[[ATTR5]]
+; X64-AVX512F-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @huge_length(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 9223372036854775807) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @huge_length(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 9223372036854775807) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[M]]
+;
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 9223372036854775807) nounwind
+  ret i32 %m
+}
+
+define i1 @huge_length_eq(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @huge_length_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 9223372036854775807) #[[ATTR0]]
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @huge_length_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 9223372036854775807) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @huge_length_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 9223372036854775807) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @huge_length_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 9223372036854775807) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @huge_length_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 9223372036854775807) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @huge_length_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 9223372036854775807) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @huge_length_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 9223372036854775807) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @huge_length_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 9223372036854775807) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @huge_length_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 9223372036854775807) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @huge_length_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 9223372036854775807) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 9223372036854775807) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; This checks non-constant sizes.
+define i32 @nonconst_length(ptr %X, ptr %Y, i64 %size) nounwind {
+; X64-LABEL: define i32 @nonconst_length(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i64 [[SIZE:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR0]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-SSE41-LABEL: define i32 @nonconst_length(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i64 [[SIZE:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR5]]
+; X64-SSE41-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @nonconst_length(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i64 [[SIZE:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR5]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @nonconst_length(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i64 [[SIZE:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR5]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @nonconst_length(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i64 [[SIZE:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-LABEL: define i32 @nonconst_length(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i64 [[SIZE:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-256-LABEL: define i32 @nonconst_length(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i64 [[SIZE:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-LABEL: define i32 @nonconst_length(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i64 [[SIZE:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR5]]
+; X64-AVX512F-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @nonconst_length(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i64 [[SIZE:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @nonconst_length(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i64 [[SIZE:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[M]]
+;
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 %size) nounwind
+  ret i32 %m
+}
+
+define i1 @nonconst_length_eq(ptr %X, ptr %Y, i64 %size) nounwind {
+; X64-LABEL: define i1 @nonconst_length_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i64 [[SIZE:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR0]]
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @nonconst_length_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i64 [[SIZE:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @nonconst_length_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i64 [[SIZE:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @nonconst_length_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i64 [[SIZE:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @nonconst_length_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i64 [[SIZE:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @nonconst_length_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i64 [[SIZE:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @nonconst_length_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i64 [[SIZE:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @nonconst_length_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i64 [[SIZE:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @nonconst_length_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i64 [[SIZE:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @nonconst_length_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i64 [[SIZE:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 %size) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
diff --git a/llvm/test/CodeGen/X86/memcmp-constant.ll b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-constant.ll
similarity index 50%
rename from llvm/test/CodeGen/X86/memcmp-constant.ll
rename to llvm/test/Transforms/ExpandMemCmp/X86/memcmp-constant.ll
index 2059b8f8040827..908c6b34183e57 100644
--- a/llvm/test/CodeGen/X86/memcmp-constant.ll
+++ b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-constant.ll
@@ -1,5 +1,7 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown               | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt -S -passes=expand-memcmp -passes=expand-memcmp -memcmp-num-loads-per-block=1 -mtriple=x86_64-unknown-unknown         < %s | FileCheck %s --check-prefix=X64 --check-prefix=X64_1LD
+; RUN: opt -S -passes=expand-memcmp -memcmp-num-loads-per-block=2 -mtriple=x86_64-unknown-unknown         < %s | FileCheck %s --check-prefix=X64 --check-prefix=X64_2LD
+
 
 @.str1 = private constant [4 x i8] c"\00\00\00\00", align 1
 @.str2 = private constant [4 x i8] c"\ff\ff\ff\ff", align 1
@@ -7,49 +9,49 @@
 declare i32 @memcmp(ptr, ptr, i64)
 
 define i32 @length4_same() nounwind {
-; CHECK-LABEL: length4_same:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    xorl %eax, %eax
-; CHECK-NEXT:    retq
+; X64-LABEL: define i32 @length4_same(
+; X64-SAME: ) #[[ATTR0:[0-9]+]] {
+; X64-NEXT:    ret i32 0
+;
   %m = tail call i32 @memcmp(ptr @.str1, ptr @.str1, i64 4) nounwind
   ret i32 %m
 }
 
 define i1 @length4_same_lt() nounwind {
-; CHECK-LABEL: length4_same_lt:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    xorl %eax, %eax
-; CHECK-NEXT:    retq
+; X64-LABEL: define i1 @length4_same_lt(
+; X64-SAME: ) #[[ATTR0]] {
+; X64-NEXT:    ret i1 false
+;
   %m = tail call i32 @memcmp(ptr @.str1, ptr @.str1, i64 4) nounwind
   %c = icmp slt i32 %m, 0
   ret i1 %c
 }
 
 define i1 @length4_same_gt() nounwind {
-; CHECK-LABEL: length4_same_gt:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    xorl %eax, %eax
-; CHECK-NEXT:    retq
+; X64-LABEL: define i1 @length4_same_gt(
+; X64-SAME: ) #[[ATTR0]] {
+; X64-NEXT:    ret i1 false
+;
   %m = tail call i32 @memcmp(ptr @.str1, ptr @.str1, i64 4) nounwind
   %c = icmp sgt i32 %m, 0
   ret i1 %c
 }
 
 define i1 @length4_same_le() nounwind {
-; CHECK-LABEL: length4_same_le:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    retq
+; X64-LABEL: define i1 @length4_same_le(
+; X64-SAME: ) #[[ATTR0]] {
+; X64-NEXT:    ret i1 true
+;
   %m = tail call i32 @memcmp(ptr @.str1, ptr @.str1, i64 4) nounwind
   %c = icmp sle i32 %m, 0
   ret i1 %c
 }
 
 define i1 @length4_same_ge() nounwind {
-; CHECK-LABEL: length4_same_ge:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    retq
+; X64-LABEL: define i1 @length4_same_ge(
+; X64-SAME: ) #[[ATTR0]] {
+; X64-NEXT:    ret i1 true
+;
   %m = tail call i32 @memcmp(ptr @.str1, ptr @.str1, i64 4) nounwind
   %c = icmp sge i32 %m, 0
   ret i1 %c
@@ -57,52 +59,55 @@ define i1 @length4_same_ge() nounwind {
 
 
 define i32 @length4() nounwind {
-; CHECK-LABEL: length4:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl $-1, %eax
-; CHECK-NEXT:    retq
+; X64-LABEL: define i32 @length4(
+; X64-SAME: ) #[[ATTR0]] {
+; X64-NEXT:    ret i32 -1
+;
   %m = tail call i32 @memcmp(ptr @.str1, ptr @.str2, i64 4) nounwind
   ret i32 %m
 }
 
 define i1 @length4_lt() nounwind {
-; CHECK-LABEL: length4_lt:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    retq
+; X64-LABEL: define i1 @length4_lt(
+; X64-SAME: ) #[[ATTR0]] {
+; X64-NEXT:    ret i1 true
+;
   %m = tail call i32 @memcmp(ptr @.str1, ptr @.str2, i64 4) nounwind
   %c = icmp slt i32 %m, 0
   ret i1 %c
 }
 
 define i1 @length4_gt() nounwind {
-; CHECK-LABEL: length4_gt:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    xorl %eax, %eax
-; CHECK-NEXT:    retq
+; X64-LABEL: define i1 @length4_gt(
+; X64-SAME: ) #[[ATTR0]] {
+; X64-NEXT:    ret i1 false
+;
   %m = tail call i32 @memcmp(ptr @.str1, ptr @.str2, i64 4) nounwind
   %c = icmp sgt i32 %m, 0
   ret i1 %c
 }
 
 define i1 @length4_le() nounwind {
-; CHECK-LABEL: length4_le:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    retq
+; X64-LABEL: define i1 @length4_le(
+; X64-SAME: ) #[[ATTR0]] {
+; X64-NEXT:    ret i1 true
+;
   %m = tail call i32 @memcmp(ptr @.str1, ptr @.str2, i64 4) nounwind
   %c = icmp sle i32 %m, 0
   ret i1 %c
 }
 
 define i1 @length4_ge() nounwind {
-; CHECK-LABEL: length4_ge:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    xorl %eax, %eax
-; CHECK-NEXT:    retq
+; X64-LABEL: define i1 @length4_ge(
+; X64-SAME: ) #[[ATTR0]] {
+; X64-NEXT:    ret i1 false
+;
   %m = tail call i32 @memcmp(ptr @.str1, ptr @.str2, i64 4) nounwind
   %c = icmp sge i32 %m, 0
   ret i1 %c
 }
 
 
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; X64_1LD: {{.*}}
+; X64_2LD: {{.*}}
diff --git a/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-minsize-x32.ll b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-minsize-x32.ll
new file mode 100644
index 00000000000000..edd70ddb445dcc
--- /dev/null
+++ b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-minsize-x32.ll
@@ -0,0 +1,493 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt -S -passes=expand-memcmp -mtriple=i686-unknown-unknown -mattr=cmov < %s | FileCheck %s --check-prefix=X86
+; RUN: opt -S -passes=expand-memcmp -mtriple=i686-unknown-unknown -mattr=+sse2 < %s | FileCheck %s  --check-prefix=X86-SSE2
+
+; This tests the expand-memcmp pass's handling of memcmp in minsize functions
+; rdar://6480398
+
+@.str = private constant [65 x i8] c"0123456789012345678901234567890123456789012345678901234567890123\00", align 1
+
+declare dso_local i32 @memcmp(ptr, ptr, i32)
+
+define i32 @length2(ptr %X, ptr %Y) nounwind minsize {
+; X86-LABEL: define i32 @length2(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1:[0-9]+]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 2) #[[ATTR2:[0-9]+]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length2(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1:[0-9]+]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 2) #[[ATTR2:[0-9]+]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind
+  ret i32 %m
+}
+
+define i1 @length2_eq(ptr %X, ptr %Y) nounwind minsize {
+; X86-LABEL: define i1 @length2_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 2) #[[ATTR2]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length2_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 2) #[[ATTR2]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length2_eq_const(ptr %X) nounwind minsize {
+; X86-LABEL: define i1 @length2_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr getelementptr inbounds ([65 x i8], ptr @.str, i32 0, i32 1), i32 2) #[[ATTR2]]
+; X86-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length2_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr getelementptr inbounds ([65 x i8], ptr @.str, i32 0, i32 1), i32 2) #[[ATTR2]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([65 x i8], ptr @.str, i32 0, i32 1), i32 2) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length2_eq_nobuiltin_attr(ptr %X, ptr %Y) nounwind minsize {
+; X86-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 2) #[[ATTR3:[0-9]+]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 2) #[[ATTR3:[0-9]+]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind nobuiltin
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length3(ptr %X, ptr %Y) nounwind minsize {
+; X86-LABEL: define i32 @length3(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 3) #[[ATTR2]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length3(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 3) #[[ATTR2]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 3) nounwind
+  ret i32 %m
+}
+
+define i1 @length3_eq(ptr %X, ptr %Y) nounwind minsize {
+; X86-LABEL: define i1 @length3_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 3) #[[ATTR2]]
+; X86-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length3_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 3) #[[ATTR2]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 3) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length4(ptr %X, ptr %Y) nounwind minsize {
+; X86-LABEL: define i32 @length4(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 4) #[[ATTR2]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length4(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 4) #[[ATTR2]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 4) nounwind
+  ret i32 %m
+}
+
+define i1 @length4_eq(ptr %X, ptr %Y) nounwind minsize {
+; X86-LABEL: define i1 @length4_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 4) #[[ATTR2]]
+; X86-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length4_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 4) #[[ATTR2]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 4) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length4_eq_const(ptr %X) nounwind minsize {
+; X86-LABEL: define i1 @length4_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr getelementptr inbounds ([65 x i8], ptr @.str, i32 0, i32 1), i32 4) #[[ATTR2]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length4_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr getelementptr inbounds ([65 x i8], ptr @.str, i32 0, i32 1), i32 4) #[[ATTR2]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([65 x i8], ptr @.str, i32 0, i32 1), i32 4) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length5(ptr %X, ptr %Y) nounwind minsize {
+; X86-LABEL: define i32 @length5(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 5) #[[ATTR2]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length5(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 5) #[[ATTR2]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 5) nounwind
+  ret i32 %m
+}
+
+define i1 @length5_eq(ptr %X, ptr %Y) nounwind minsize {
+; X86-LABEL: define i1 @length5_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 5) #[[ATTR2]]
+; X86-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length5_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 5) #[[ATTR2]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 5) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length8(ptr %X, ptr %Y) nounwind minsize {
+; X86-LABEL: define i32 @length8(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 8) #[[ATTR2]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length8(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 8) #[[ATTR2]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 8) nounwind
+  ret i32 %m
+}
+
+define i1 @length8_eq(ptr %X, ptr %Y) nounwind minsize {
+; X86-LABEL: define i1 @length8_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 8) #[[ATTR2]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length8_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 8) #[[ATTR2]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 8) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length8_eq_const(ptr %X) nounwind minsize {
+; X86-LABEL: define i1 @length8_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 8) #[[ATTR2]]
+; X86-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length8_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 8) #[[ATTR2]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 8) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length12_eq(ptr %X, ptr %Y) nounwind minsize {
+; X86-LABEL: define i1 @length12_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 12) #[[ATTR2]]
+; X86-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length12_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 12) #[[ATTR2]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 12) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length12(ptr %X, ptr %Y) nounwind minsize {
+; X86-LABEL: define i32 @length12(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 12) #[[ATTR2]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length12(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 12) #[[ATTR2]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 12) nounwind
+  ret i32 %m
+}
+
+; PR33329 - https://bugs.llvm.org/show_bug.cgi?id=33329
+
+define i32 @length16(ptr %X, ptr %Y) nounwind minsize {
+; X86-LABEL: define i32 @length16(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 16) #[[ATTR2]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length16(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 16) #[[ATTR2]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 16) nounwind
+  ret i32 %m
+}
+
+define i1 @length16_eq(ptr %x, ptr %y) nounwind minsize {
+;
+; X86-LABEL: define i1 @length16_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 16) #[[ATTR2]]
+; X86-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length16_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 16) #[[ATTR2]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 16) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length16_eq_const(ptr %X) nounwind minsize {
+;
+; X86-LABEL: define i1 @length16_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 16) #[[ATTR2]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length16_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 16) #[[ATTR2]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 16) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; PR33914 - https://bugs.llvm.org/show_bug.cgi?id=33914
+
+define i32 @length24(ptr %X, ptr %Y) nounwind minsize {
+; X86-LABEL: define i32 @length24(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 24) #[[ATTR2]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length24(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 24) #[[ATTR2]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 24) nounwind
+  ret i32 %m
+}
+
+define i1 @length24_eq(ptr %x, ptr %y) nounwind minsize {
+; X86-LABEL: define i1 @length24_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 24) #[[ATTR2]]
+; X86-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length24_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 24) #[[ATTR2]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 24) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length24_eq_const(ptr %X) nounwind minsize {
+; X86-LABEL: define i1 @length24_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 24) #[[ATTR2]]
+; X86-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length24_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 24) #[[ATTR2]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 24) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length32(ptr %X, ptr %Y) nounwind minsize {
+; X86-LABEL: define i32 @length32(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 32) #[[ATTR2]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length32(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 32) #[[ATTR2]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 32) nounwind
+  ret i32 %m
+}
+
+; PR33325 - https://bugs.llvm.org/show_bug.cgi?id=33325
+
+define i1 @length32_eq(ptr %x, ptr %y) nounwind minsize {
+; X86-LABEL: define i1 @length32_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 32) #[[ATTR2]]
+; X86-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length32_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 32) #[[ATTR2]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 32) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length32_eq_const(ptr %X) nounwind minsize {
+; X86-LABEL: define i1 @length32_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 32) #[[ATTR2]]
+; X86-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length32_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 32) #[[ATTR2]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 32) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length64(ptr %X, ptr %Y) nounwind minsize {
+; X86-LABEL: define i32 @length64(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 64) #[[ATTR2]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length64(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 64) #[[ATTR2]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 64) nounwind
+  ret i32 %m
+}
+
+define i1 @length64_eq(ptr %x, ptr %y) nounwind minsize {
+; X86-LABEL: define i1 @length64_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 64) #[[ATTR2]]
+; X86-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length64_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 64) #[[ATTR2]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 64) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length64_eq_const(ptr %X) nounwind minsize {
+; X86-LABEL: define i1 @length64_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 64) #[[ATTR2]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length64_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 64) #[[ATTR2]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 64) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
diff --git a/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-minsize.ll b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-minsize.ll
new file mode 100644
index 00000000000000..431dc158962996
--- /dev/null
+++ b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-minsize.ll
@@ -0,0 +1,707 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt -S -passes=expand-memcmp -mtriple=x86_64-unknown-unknown < %s | FileCheck %s --check-prefix=X64
+; RUN: opt -S -passes=expand-memcmp -mtriple=x86_64-unknown-unknown -mattr=avx < %s | FileCheck %s --check-prefix=X64-AVX1
+; RUN: opt -S -passes=expand-memcmp -mtriple=x86_64-unknown-unknown -mattr=avx2 < %s | FileCheck %s --check-prefix=X64-AVX2
+
+; This tests the expand-memcmp pass's handling of memcmp in minsize functions
+; rdar://6480398
+
+@.str = private constant [65 x i8] c"0123456789012345678901234567890123456789012345678901234567890123\00", align 1
+
+declare dso_local i32 @memcmp(ptr, ptr, i64)
+
+define i32 @length2(ptr %X, ptr %Y) nounwind minsize {
+; X64-LABEL: define i32 @length2(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0:[0-9]+]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 2) #[[ATTR1:[0-9]+]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @length2(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1:[0-9]+]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 2) #[[ATTR2:[0-9]+]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @length2(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1:[0-9]+]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 2) #[[ATTR2:[0-9]+]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
+  ret i32 %m
+}
+
+define i1 @length2_eq(ptr %X, ptr %Y) nounwind minsize {
+; X64-LABEL: define i1 @length2_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 2) #[[ATTR1]]
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length2_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 2) #[[ATTR2]]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length2_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 2) #[[ATTR2]]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length2_eq_const(ptr %X) nounwind minsize {
+; X64-LABEL: define i1 @length2_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr getelementptr inbounds ([65 x i8], ptr @.str, i32 0, i32 1), i64 2) #[[ATTR1]]
+; X64-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length2_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr getelementptr inbounds ([65 x i8], ptr @.str, i32 0, i32 1), i64 2) #[[ATTR2]]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length2_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr getelementptr inbounds ([65 x i8], ptr @.str, i32 0, i32 1), i64 2) #[[ATTR2]]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([65 x i8], ptr @.str, i32 0, i32 1), i64 2) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length2_eq_nobuiltin_attr(ptr %X, ptr %Y) nounwind minsize {
+; X64-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 2) #[[ATTR2:[0-9]+]]
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 2) #[[ATTR3:[0-9]+]]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 2) #[[ATTR3:[0-9]+]]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind nobuiltin
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length3(ptr %X, ptr %Y) nounwind minsize {
+; X64-LABEL: define i32 @length3(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 3) #[[ATTR1]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @length3(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 3) #[[ATTR2]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @length3(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 3) #[[ATTR2]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 3) nounwind
+  ret i32 %m
+}
+
+define i1 @length3_eq(ptr %X, ptr %Y) nounwind minsize {
+; X64-LABEL: define i1 @length3_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 3) #[[ATTR1]]
+; X64-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length3_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 3) #[[ATTR2]]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length3_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 3) #[[ATTR2]]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 3) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length4(ptr %X, ptr %Y) nounwind minsize {
+; X64-LABEL: define i32 @length4(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 4) #[[ATTR1]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @length4(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 4) #[[ATTR2]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @length4(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 4) #[[ATTR2]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
+  ret i32 %m
+}
+
+define i1 @length4_eq(ptr %X, ptr %Y) nounwind minsize {
+; X64-LABEL: define i1 @length4_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 4) #[[ATTR1]]
+; X64-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length4_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 4) #[[ATTR2]]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length4_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 4) #[[ATTR2]]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length4_eq_const(ptr %X) nounwind minsize {
+; X64-LABEL: define i1 @length4_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr getelementptr inbounds ([65 x i8], ptr @.str, i32 0, i32 1), i64 4) #[[ATTR1]]
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length4_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr getelementptr inbounds ([65 x i8], ptr @.str, i32 0, i32 1), i64 4) #[[ATTR2]]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length4_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr getelementptr inbounds ([65 x i8], ptr @.str, i32 0, i32 1), i64 4) #[[ATTR2]]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([65 x i8], ptr @.str, i32 0, i32 1), i64 4) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length5(ptr %X, ptr %Y) nounwind minsize {
+; X64-LABEL: define i32 @length5(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 5) #[[ATTR1]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @length5(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 5) #[[ATTR2]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @length5(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 5) #[[ATTR2]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) nounwind
+  ret i32 %m
+}
+
+define i1 @length5_eq(ptr %X, ptr %Y) nounwind minsize {
+; X64-LABEL: define i1 @length5_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 5) #[[ATTR1]]
+; X64-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length5_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 5) #[[ATTR2]]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length5_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 5) #[[ATTR2]]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length8(ptr %X, ptr %Y) nounwind minsize {
+; X64-LABEL: define i32 @length8(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 8) #[[ATTR1]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @length8(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 8) #[[ATTR2]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @length8(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 8) #[[ATTR2]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 8) nounwind
+  ret i32 %m
+}
+
+define i1 @length8_eq(ptr %X, ptr %Y) nounwind minsize {
+; X64-LABEL: define i1 @length8_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 8) #[[ATTR1]]
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length8_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 8) #[[ATTR2]]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length8_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 8) #[[ATTR2]]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 8) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length8_eq_const(ptr %X) nounwind minsize {
+; X64-LABEL: define i1 @length8_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 8) #[[ATTR1]]
+; X64-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length8_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 8) #[[ATTR2]]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length8_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 8) #[[ATTR2]]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 8) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length12_eq(ptr %X, ptr %Y) nounwind minsize {
+; X64-LABEL: define i1 @length12_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 12) #[[ATTR1]]
+; X64-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length12_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 12) #[[ATTR2]]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length12_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 12) #[[ATTR2]]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 12) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length12(ptr %X, ptr %Y) nounwind minsize {
+; X64-LABEL: define i32 @length12(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 12) #[[ATTR1]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @length12(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 12) #[[ATTR2]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @length12(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 12) #[[ATTR2]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 12) nounwind
+  ret i32 %m
+}
+
+; PR33329 - https://bugs.llvm.org/show_bug.cgi?id=33329
+
+define i32 @length16(ptr %X, ptr %Y) nounwind minsize {
+;
+; X64-LABEL: define i32 @length16(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 16) #[[ATTR1]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @length16(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 16) #[[ATTR2]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @length16(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 16) #[[ATTR2]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 16) nounwind
+  ret i32 %m
+}
+
+define i1 @length16_eq(ptr %x, ptr %y) nounwind minsize {
+; X64-SSE2-LABEL: length16_eq:
+; X64-SSE2:       # %bb.0:
+; X64-SSE2-NEXT:    movdqu (%rsi), %xmm0
+; X64-SSE2-NEXT:    movdqu (%rdi), %xmm1
+; X64-SSE2-NEXT:    pcmpeqb %xmm0, %xmm1
+; X64-SSE2-NEXT:    pmovmskb %xmm1, %eax
+; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
+; X64-SSE2-NEXT:    setne %al
+; X64-SSE2-NEXT:    retq
+;
+; X64-AVX-LABEL: length16_eq:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-AVX-NEXT:    vpxor (%rsi), %xmm0, %xmm0
+; X64-AVX-NEXT:    vptest %xmm0, %xmm0
+; X64-AVX-NEXT:    setne %al
+; X64-AVX-NEXT:    retq
+; X64-LABEL: define i1 @length16_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 16) #[[ATTR1]]
+; X64-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length16_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 16) #[[ATTR2]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length16_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 16) #[[ATTR2]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 16) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length16_eq_const(ptr %X) nounwind minsize {
+; X64-SSE2-LABEL: length16_eq_const:
+; X64-SSE2:       # %bb.0:
+; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
+; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-SSE2-NEXT:    pmovmskb %xmm0, %eax
+; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
+; X64-SSE2-NEXT:    sete %al
+; X64-SSE2-NEXT:    retq
+;
+; X64-AVX-LABEL: length16_eq_const:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-AVX-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT:    vptest %xmm0, %xmm0
+; X64-AVX-NEXT:    sete %al
+; X64-AVX-NEXT:    retq
+; X64-LABEL: define i1 @length16_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 16) #[[ATTR1]]
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length16_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 16) #[[ATTR2]]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length16_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 16) #[[ATTR2]]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 16) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; PR33914 - https://bugs.llvm.org/show_bug.cgi?id=33914
+
+define i32 @length24(ptr %X, ptr %Y) nounwind minsize {
+; X64-LABEL: define i32 @length24(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 24) #[[ATTR1]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @length24(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 24) #[[ATTR2]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @length24(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 24) #[[ATTR2]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 24) nounwind
+  ret i32 %m
+}
+
+define i1 @length24_eq(ptr %x, ptr %y) nounwind minsize {
+; X64-LABEL: define i1 @length24_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 24) #[[ATTR1]]
+; X64-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length24_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 24) #[[ATTR2]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length24_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 24) #[[ATTR2]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 24) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length24_eq_const(ptr %X) nounwind minsize {
+; X64-LABEL: define i1 @length24_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 24) #[[ATTR1]]
+; X64-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length24_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 24) #[[ATTR2]]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length24_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 24) #[[ATTR2]]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 24) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length32(ptr %X, ptr %Y) nounwind minsize {
+; X64-LABEL: define i32 @length32(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 32) #[[ATTR1]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @length32(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 32) #[[ATTR2]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @length32(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 32) #[[ATTR2]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 32) nounwind
+  ret i32 %m
+}
+
+; PR33325 - https://bugs.llvm.org/show_bug.cgi?id=33325
+
+define i1 @length32_eq(ptr %x, ptr %y) nounwind minsize {
+; X64-SSE2-LABEL: length32_eq:
+; X64-SSE2:       # %bb.0:
+; X64-SSE2-NEXT:    pushq %rax
+; X64-SSE2-NEXT:    pushq $32
+; X64-SSE2-NEXT:    popq %rdx
+; X64-SSE2-NEXT:    callq memcmp
+; X64-SSE2-NEXT:    testl %eax, %eax
+; X64-SSE2-NEXT:    sete %al
+; X64-SSE2-NEXT:    popq %rcx
+; X64-SSE2-NEXT:    retq
+;
+; X64-LABEL: define i1 @length32_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 32) #[[ATTR1]]
+; X64-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length32_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 32) #[[ATTR2]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length32_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 32) #[[ATTR2]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 32) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length32_eq_const(ptr %X) nounwind minsize {
+; X64-SSE2-LABEL: length32_eq_const:
+; X64-SSE2:       # %bb.0:
+; X64-SSE2-NEXT:    pushq %rax
+; X64-SSE2-NEXT:    pushq $32
+; X64-SSE2-NEXT:    popq %rdx
+; X64-SSE2-NEXT:    movl $.L.str, %esi
+; X64-SSE2-NEXT:    callq memcmp
+; X64-SSE2-NEXT:    testl %eax, %eax
+; X64-SSE2-NEXT:    setne %al
+; X64-SSE2-NEXT:    popq %rcx
+; X64-SSE2-NEXT:    retq
+;
+; X64-LABEL: define i1 @length32_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 32) #[[ATTR1]]
+; X64-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length32_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 32) #[[ATTR2]]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length32_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 32) #[[ATTR2]]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 32) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length64(ptr %X, ptr %Y) nounwind minsize {
+; X64-LABEL: define i32 @length64(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR1]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @length64(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR2]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @length64(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR2]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 64) nounwind
+  ret i32 %m
+}
+
+define i1 @length64_eq(ptr %x, ptr %y) nounwind minsize {
+; X64-LABEL: define i1 @length64_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR1]]
+; X64-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length64_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR2]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length64_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR2]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 64) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length64_eq_const(ptr %X) nounwind minsize {
+; X64-LABEL: define i1 @length64_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 64) #[[ATTR1]]
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length64_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 64) #[[ATTR2]]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length64_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 64) #[[ATTR2]]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 64) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
diff --git a/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-more-load-pairs-x32.ll b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-more-load-pairs-x32.ll
new file mode 100644
index 00000000000000..abdadb14086c20
--- /dev/null
+++ b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-more-load-pairs-x32.ll
@@ -0,0 +1,6203 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; NOTE: This is a copy of llvm/test/CodeGen/X86/memcmp.ll with more load pairs. Please keep it that way.
+; RUN: opt -S -passes=expand-memcmp -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 -mtriple=i686-unknown-unknown -mattr=cmov     < %s | FileCheck %s --check-prefixes=X86
+; RUN: opt -S -passes=expand-memcmp -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 -mtriple=i686-unknown-unknown -mattr=+sse     < %s | FileCheck %s --check-prefixes=X86-SSE1
+; RUN: opt -S -passes=expand-memcmp -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 -mtriple=i686-unknown-unknown -mattr=+sse2    < %s | FileCheck %s --check-prefixes=X86-SSE2
+; RUN: opt -S -passes=expand-memcmp -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 -mtriple=i686-unknown-unknown -mattr=+sse4.1  < %s | FileCheck %s --check-prefixes=X86-SSE41
+
+; This tests IR-level inlining/optimization of memcmp by the expand-memcmp pass
+; rdar://6480398
+
+ at .str = private constant [513 x i8] c"01234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901\00", align 1
+
+declare dso_local i32 @memcmp(ptr, ptr, i32)
+
+define i32 @length0(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length0(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1:[0-9]+]] {
+; X86-NEXT:    ret i32 0
+;
+; X86-SSE1-LABEL: define i32 @length0(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1:[0-9]+]] {
+; X86-SSE1-NEXT:    ret i32 0
+;
+; X86-SSE2-LABEL: define i32 @length0(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1:[0-9]+]] {
+; X86-SSE2-NEXT:    ret i32 0
+;
+; X86-SSE41-LABEL: define i32 @length0(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1:[0-9]+]] {
+; X86-SSE41-NEXT:    ret i32 0
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 0) nounwind
+  ret i32 %m
+  }
+
+define i1 @length0_eq(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length0_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    ret i1 true
+;
+; X86-SSE1-LABEL: define i1 @length0_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    ret i1 true
+;
+; X86-SSE2-LABEL: define i1 @length0_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    ret i1 true
+;
+; X86-SSE41-LABEL: define i1 @length0_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    ret i1 true
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 0) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length0_lt(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length0_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    ret i1 false
+;
+; X86-SSE1-LABEL: define i1 @length0_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    ret i1 false
+;
+; X86-SSE2-LABEL: define i1 @length0_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    ret i1 false
+;
+; X86-SSE41-LABEL: define i1 @length0_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    ret i1 false
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 0) nounwind
+  %c = icmp slt i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length2(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length2(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X86-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X86-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X86-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X86-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X86-NEXT:    ret i32 [[TMP7]]
+;
+; X86-SSE1-LABEL: define i32 @length2(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X86-SSE1-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X86-SSE1-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X86-SSE1-NEXT:    ret i32 [[TMP7]]
+;
+; X86-SSE2-LABEL: define i32 @length2(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X86-SSE2-NEXT:    ret i32 [[TMP7]]
+;
+; X86-SSE41-LABEL: define i32 @length2(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X86-SSE41-NEXT:    ret i32 [[TMP7]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind
+  ret i32 %m
+}
+
+define i1 @length2_eq(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length2_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X86-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length2_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length2_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length2_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length2_lt(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length2_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X86-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X86-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X86-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X86-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X86-NEXT:    [[C:%.*]] = icmp slt i32 [[TMP7]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length2_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X86-SSE1-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X86-SSE1-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp slt i32 [[TMP7]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length2_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp slt i32 [[TMP7]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length2_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp slt i32 [[TMP7]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind
+  %c = icmp slt i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length2_gt(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length2_gt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X86-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X86-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X86-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X86-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X86-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP7]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length2_gt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X86-SSE1-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X86-SSE1-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP7]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length2_gt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP7]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length2_gt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP7]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind
+  %c = icmp sgt i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length2_eq_const(ptr %X) nounwind { ; 2-byte memcmp of %X against constant data; expected to become a single i16 load compared with an immediate
+; X86-LABEL: define i1 @length2_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = icmp ne i16 [[TMP1]], 12849
+; X86-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X86-NEXT:    ret i1 [[TMP2]]
+;
+; X86-SSE1-LABEL: define i1 @length2_eq_const(
+; X86-SSE1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = icmp ne i16 [[TMP1]], 12849
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X86-SSE1-NEXT:    ret i1 [[TMP2]]
+;
+; X86-SSE2-LABEL: define i1 @length2_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = icmp ne i16 [[TMP1]], 12849
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X86-SSE2-NEXT:    ret i1 [[TMP2]]
+;
+; X86-SSE41-LABEL: define i1 @length2_eq_const(
+; X86-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = icmp ne i16 [[TMP1]], 12849
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X86-SSE41-NEXT:    ret i1 [[TMP2]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i32 2) nounwind ; compares 2 bytes starting at @.str + 1; 12849 = 0x3231, presumably the bytes '1','2' of @.str read little-endian (its definition is not visible here - TODO confirm)
+  %c = icmp ne i32 %m, 0 ; note: tests inequality even though the function is named _eq
+  ret i1 %c
+}
+
+define i1 @length2_eq_nobuiltin_attr(ptr %X, ptr %Y) nounwind { ; 2-byte memcmp whose call site is marked nobuiltin; every visible check prefix expects the call to survive unexpanded
+; X86-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 2) #[[ATTR4:[0-9]+]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 2) #[[ATTR4:[0-9]+]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 2) #[[ATTR4:[0-9]+]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 2) #[[ATTR4:[0-9]+]]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind nobuiltin ; nobuiltin call-site attribute: the expected output keeps this call as-is
+  %c = icmp eq i32 %m, 0 ; equality test, also expected unchanged in the output
+  ret i1 %c
+}
+
+define i32 @length3(ptr %X, ptr %Y) nounwind { ; 3-byte memcmp returning the raw i32 result; expected expansion: bswapped i16 compare in loadbb, byte-2 difference in loadbb1, merged by a phi in endblock
+; X86-LABEL: define i32 @length3(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    br label [[LOADBB:%.*]]
+; X86:       res_block:
+; X86-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP5:%.*]], [[TMP6:%.*]]
+; X86-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86:       loadbb:
+; X86-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X]], align 1
+; X86-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP5]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X86-NEXT:    [[TMP6]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; X86-NEXT:    [[TMP7:%.*]] = icmp eq i16 [[TMP5]], [[TMP6]]
+; X86-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X86:       loadbb1:
+; X86-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X86-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X86-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X86-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X86-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X86-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X86-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X86-NEXT:    br label [[ENDBLOCK]]
+; X86:       endblock:
+; X86-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-NEXT:    ret i32 [[PHI_RES]]
+;
+; X86-SSE1-LABEL: define i32 @length3(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE1:       res_block:
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP5:%.*]], [[TMP6:%.*]]
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE1:       loadbb:
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP5]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X86-SSE1-NEXT:    [[TMP6]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = icmp eq i16 [[TMP5]], [[TMP6]]
+; X86-SSE1-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X86-SSE1:       loadbb1:
+; X86-SSE1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X86-SSE1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X86-SSE1-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X86-SSE1-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X86-SSE1-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X86-SSE1-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X86-SSE1-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X86-SSE1-NEXT:    br label [[ENDBLOCK]]
+; X86-SSE1:       endblock:
+; X86-SSE1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE1-NEXT:    ret i32 [[PHI_RES]]
+;
+; X86-SSE2-LABEL: define i32 @length3(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE2:       res_block:
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP5:%.*]], [[TMP6:%.*]]
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE2:       loadbb:
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP5]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X86-SSE2-NEXT:    [[TMP6]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = icmp eq i16 [[TMP5]], [[TMP6]]
+; X86-SSE2-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X86-SSE2:       loadbb1:
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X86-SSE2-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X86-SSE2-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X86-SSE2-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X86-SSE2-NEXT:    br label [[ENDBLOCK]]
+; X86-SSE2:       endblock:
+; X86-SSE2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X86-SSE41-LABEL: define i32 @length3(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE41:       res_block:
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP5:%.*]], [[TMP6:%.*]]
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE41:       loadbb:
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP5]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X86-SSE41-NEXT:    [[TMP6]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i16 [[TMP5]], [[TMP6]]
+; X86-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X86-SSE41:       loadbb1:
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X86-SSE41-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X86-SSE41-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X86-SSE41-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X86-SSE41-NEXT:    br label [[ENDBLOCK]]
+; X86-SSE41:       endblock:
+; X86-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE41-NEXT:    ret i32 [[PHI_RES]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 3) nounwind ; size 3 is covered by a 2-byte and a 1-byte load pair in the expected output
+  ret i32 %m
+}
+
+define i1 @length3_eq(ptr %X, ptr %Y) nounwind { ; 3-byte memcmp used only for an (in)equality test; expected expansion is branch-free: xor of the i16 loads or'ed with xor of the zero-extended bytes at offset 2
+; X86-LABEL: define i1 @length3_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; X86-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X86-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X86-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X86-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X86-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; X86-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; X86-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; X86-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; X86-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; X86-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X86-NEXT:    ret i1 [[TMP12]]
+;
+; X86-SSE1-LABEL: define i1 @length3_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X86-SSE1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X86-SSE1-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X86-SSE1-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; X86-SSE1-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; X86-SSE1-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; X86-SSE1-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; X86-SSE1-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; X86-SSE1-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X86-SSE1-NEXT:    ret i1 [[TMP12]]
+;
+; X86-SSE2-LABEL: define i1 @length3_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; X86-SSE2-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; X86-SSE2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X86-SSE2-NEXT:    ret i1 [[TMP12]]
+;
+; X86-SSE41-LABEL: define i1 @length3_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; X86-SSE41-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; X86-SSE41-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X86-SSE41-NEXT:    ret i1 [[TMP12]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 3) nounwind
+  %c = icmp ne i32 %m, 0 ; note: tests inequality even though the function is named _eq
+  ret i1 %c
+}
+
+define i32 @length4(ptr %X, ptr %Y) nounwind { ; 4-byte memcmp returning the raw i32 result; expected as zext(ugt) - zext(ult) on the bswapped i32 loads, with no extra basic blocks
+; X86-LABEL: define i32 @length4(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X86-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X86-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X86-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X86-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X86-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X86-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X86-NEXT:    ret i32 [[TMP9]]
+;
+; X86-SSE1-LABEL: define i32 @length4(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X86-SSE1-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X86-SSE1-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X86-SSE1-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X86-SSE1-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X86-SSE1-NEXT:    ret i32 [[TMP9]]
+;
+; X86-SSE2-LABEL: define i32 @length4(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X86-SSE2-NEXT:    ret i32 [[TMP9]]
+;
+; X86-SSE41-LABEL: define i32 @length4(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X86-SSE41-NEXT:    ret i32 [[TMP9]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 4) nounwind ; size 4 is covered by a single i32 load pair in the expected output
+  ret i32 %m
+}
+
+define i1 @length4_eq(ptr %X, ptr %Y) nounwind { ; 4-byte memcmp used only for an (in)equality test; expected as one icmp ne of two align-1 i32 loads, with no bswap
+; X86-LABEL: define i1 @length4_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X86-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X86-NEXT:    ret i1 [[TMP3]]
+;
+; X86-SSE1-LABEL: define i1 @length4_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X86-SSE1-NEXT:    ret i1 [[TMP3]]
+;
+; X86-SSE2-LABEL: define i1 @length4_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X86-SSE2-NEXT:    ret i1 [[TMP3]]
+;
+; X86-SSE41-LABEL: define i1 @length4_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X86-SSE41-NEXT:    ret i1 [[TMP3]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 4) nounwind
+  %c = icmp ne i32 %m, 0 ; note: tests inequality even though the function is named _eq
+  ret i1 %c
+}
+
+define i1 @length4_lt(ptr %X, ptr %Y) nounwind { ; memcmp(X, Y, 4) < 0; expected as a single icmp ult on the bswapped i32 loads
+; X86-LABEL: define i1 @length4_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X86-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X86-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X86-NEXT:    ret i1 [[TMP5]]
+;
+; X86-SSE1-LABEL: define i1 @length4_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X86-SSE1-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X86-SSE1-NEXT:    ret i1 [[TMP5]]
+;
+; X86-SSE2-LABEL: define i1 @length4_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X86-SSE2-NEXT:    ret i1 [[TMP5]]
+;
+; X86-SSE41-LABEL: define i1 @length4_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X86-SSE41-NEXT:    ret i1 [[TMP5]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 4) nounwind
+  %c = icmp slt i32 %m, 0 ; signed "< 0" on the memcmp result; the expected output replaces it with an unsigned compare of the byte-swapped words
+  ret i1 %c
+}
+
+define i1 @length4_gt(ptr %X, ptr %Y) nounwind { ; memcmp(X, Y, 4) > 0; expected as a single icmp ugt on the bswapped i32 loads
+; X86-LABEL: define i1 @length4_gt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X86-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X86-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X86-NEXT:    ret i1 [[TMP5]]
+;
+; X86-SSE1-LABEL: define i1 @length4_gt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X86-SSE1-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X86-SSE1-NEXT:    ret i1 [[TMP5]]
+;
+; X86-SSE2-LABEL: define i1 @length4_gt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X86-SSE2-NEXT:    ret i1 [[TMP5]]
+;
+; X86-SSE41-LABEL: define i1 @length4_gt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X86-SSE41-NEXT:    ret i1 [[TMP5]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 4) nounwind
+  %c = icmp sgt i32 %m, 0 ; signed "> 0" on the memcmp result; the expected output replaces it with an unsigned compare of the byte-swapped words
+  ret i1 %c
+}
+
+define i1 @length4_eq_const(ptr %X) nounwind { ; 4-byte memcmp of %X against constant data, tested for equality; expected as one i32 load compared with an immediate
+; X86-LABEL: define i1 @length4_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 875770417
+; X86-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length4_eq_const(
+; X86-SSE1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 875770417
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length4_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 875770417
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length4_eq_const(
+; X86-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 875770417
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i32 4) nounwind ; compares 4 bytes starting at @.str + 1; 875770417 = 0x34333231, presumably the bytes '1'..'4' of @.str read little-endian (its definition is not visible here - TODO confirm)
+  %c = icmp eq i32 %m, 0 ; the expected output keeps this icmp eq, applied to the zero-extended ne result
+  ret i1 %c
+}
+
+define i32 @length5(ptr %X, ptr %Y) nounwind { ; 5-byte memcmp returning the raw i32 result; expected expansion: bswapped i32 compare in loadbb, byte-4 difference in loadbb1, merged by a phi in endblock
+; X86-LABEL: define i32 @length5(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    br label [[LOADBB:%.*]]
+; X86:       res_block:
+; X86-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X86-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86:       loadbb:
+; X86-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X86:       loadbb1:
+; X86-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X86-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X86-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X86-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X86-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X86-NEXT:    br label [[ENDBLOCK]]
+; X86:       endblock:
+; X86-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-NEXT:    ret i32 [[PHI_RES]]
+;
+; X86-SSE1-LABEL: define i32 @length5(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE1:       res_block:
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE1:       loadbb:
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE1-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE1-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X86-SSE1:       loadbb1:
+; X86-SSE1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE1-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X86-SSE1-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X86-SSE1-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X86-SSE1-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X86-SSE1-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X86-SSE1-NEXT:    br label [[ENDBLOCK]]
+; X86-SSE1:       endblock:
+; X86-SSE1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE1-NEXT:    ret i32 [[PHI_RES]]
+;
+; X86-SSE2-LABEL: define i32 @length5(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE2:       res_block:
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE2:       loadbb:
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE2-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE2-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X86-SSE2:       loadbb1:
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X86-SSE2-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X86-SSE2-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X86-SSE2-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X86-SSE2-NEXT:    br label [[ENDBLOCK]]
+; X86-SSE2:       endblock:
+; X86-SSE2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X86-SSE41-LABEL: define i32 @length5(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE41:       res_block:
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE41:       loadbb:
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE41-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X86-SSE41:       loadbb1:
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X86-SSE41-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X86-SSE41-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X86-SSE41-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X86-SSE41-NEXT:    br label [[ENDBLOCK]]
+; X86-SSE41:       endblock:
+; X86-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE41-NEXT:    ret i32 [[PHI_RES]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 5) nounwind ; size 5 is covered by a 4-byte and a 1-byte load pair in the expected output
+  ret i32 %m
+}
+
+define i1 @length5_eq(ptr %X, ptr %Y) nounwind { ; 5-byte memcmp used only for an (in)equality test; expected expansion is branch-free: xor of the i32 loads or'ed with xor of the zero-extended bytes at offset 4
+; X86-LABEL: define i1 @length5_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X86-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X86-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; X86-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; X86-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X86-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X86-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X86-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X86-NEXT:    ret i1 [[TMP12]]
+;
+; X86-SSE1-LABEL: define i1 @length5_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE1-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X86-SSE1-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; X86-SSE1-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; X86-SSE1-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X86-SSE1-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X86-SSE1-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X86-SSE1-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X86-SSE1-NEXT:    ret i1 [[TMP12]]
+;
+; X86-SSE2-LABEL: define i1 @length5_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X86-SSE2-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X86-SSE2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X86-SSE2-NEXT:    ret i1 [[TMP12]]
+;
+; X86-SSE41-LABEL: define i1 @length5_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X86-SSE41-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X86-SSE41-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X86-SSE41-NEXT:    ret i1 [[TMP12]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 5) nounwind
+  %c = icmp ne i32 %m, 0 ; note: tests inequality even though the function is named _eq
+  ret i1 %c
+}
+
+define i1 @length5_lt(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length5_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    br label [[LOADBB:%.*]]
+; X86:       res_block:
+; X86-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X86-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86:       loadbb:
+; X86-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X86:       loadbb1:
+; X86-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X86-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X86-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X86-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X86-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X86-NEXT:    br label [[ENDBLOCK]]
+; X86:       endblock:
+; X86-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length5_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE1:       res_block:
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE1:       loadbb:
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE1-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE1-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X86-SSE1:       loadbb1:
+; X86-SSE1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE1-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X86-SSE1-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X86-SSE1-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X86-SSE1-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X86-SSE1-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X86-SSE1-NEXT:    br label [[ENDBLOCK]]
+; X86-SSE1:       endblock:
+; X86-SSE1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length5_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE2:       res_block:
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE2:       loadbb:
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE2-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE2-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X86-SSE2:       loadbb1:
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X86-SSE2-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X86-SSE2-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X86-SSE2-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X86-SSE2-NEXT:    br label [[ENDBLOCK]]
+; X86-SSE2:       endblock:
+; X86-SSE2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length5_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE41:       res_block:
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE41:       loadbb:
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE41-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X86-SSE41:       loadbb1:
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X86-SSE41-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X86-SSE41-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X86-SSE41-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X86-SSE41-NEXT:    br label [[ENDBLOCK]]
+; X86-SSE41:       endblock:
+; X86-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 5) nounwind
+  %c = icmp slt i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length7(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length7(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    br label [[LOADBB:%.*]]
+; X86:       res_block:
+; X86-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X86-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X86-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86:       loadbb:
+; X86-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86:       loadbb1:
+; X86-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X86-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X86-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86:       endblock:
+; X86-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-NEXT:    ret i32 [[PHI_RES]]
+;
+; X86-SSE1-LABEL: define i32 @length7(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE1:       res_block:
+; X86-SSE1-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X86-SSE1-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE1:       loadbb:
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE1-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86-SSE1:       loadbb1:
+; X86-SSE1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X86-SSE1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X86-SSE1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-SSE1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE1-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-SSE1-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-SSE1-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-SSE1-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86-SSE1:       endblock:
+; X86-SSE1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE1-NEXT:    ret i32 [[PHI_RES]]
+;
+; X86-SSE2-LABEL: define i32 @length7(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE2:       res_block:
+; X86-SSE2-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X86-SSE2-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE2:       loadbb:
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE2-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86-SSE2:       loadbb1:
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE2-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-SSE2-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-SSE2-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-SSE2-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86-SSE2:       endblock:
+; X86-SSE2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X86-SSE41-LABEL: define i32 @length7(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE41:       res_block:
+; X86-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X86-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE41:       loadbb:
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE41-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86-SSE41:       loadbb1:
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE41-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-SSE41-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-SSE41-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-SSE41-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86-SSE41:       endblock:
+; X86-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE41-NEXT:    ret i32 [[PHI_RES]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 7) nounwind
+  ret i32 %m
+}
+
+define i1 @length7_eq(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length7_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X86-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X86-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X86-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-NEXT:    ret i1 [[TMP10]]
+;
+; X86-SSE1-LABEL: define i1 @length7_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X86-SSE1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X86-SSE1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-SSE1-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-SSE1-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-SSE1-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X86-SSE1-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-SSE1-NEXT:    ret i1 [[TMP10]]
+;
+; X86-SSE2-LABEL: define i1 @length7_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-SSE2-NEXT:    ret i1 [[TMP10]]
+;
+; X86-SSE41-LABEL: define i1 @length7_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-SSE41-NEXT:    ret i1 [[TMP10]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 7) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length7_lt(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length7_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    br label [[LOADBB:%.*]]
+; X86:       res_block:
+; X86-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X86-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X86-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86:       loadbb:
+; X86-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86:       loadbb1:
+; X86-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X86-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X86-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86:       endblock:
+; X86-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length7_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE1:       res_block:
+; X86-SSE1-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X86-SSE1-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE1:       loadbb:
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE1-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86-SSE1:       loadbb1:
+; X86-SSE1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X86-SSE1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X86-SSE1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-SSE1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE1-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-SSE1-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-SSE1-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-SSE1-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86-SSE1:       endblock:
+; X86-SSE1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length7_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE2:       res_block:
+; X86-SSE2-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X86-SSE2-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE2:       loadbb:
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE2-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86-SSE2:       loadbb1:
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE2-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-SSE2-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-SSE2-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-SSE2-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86-SSE2:       endblock:
+; X86-SSE2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length7_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE41:       res_block:
+; X86-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X86-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE41:       loadbb:
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE41-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86-SSE41:       loadbb1:
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE41-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-SSE41-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-SSE41-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-SSE41-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86-SSE41:       endblock:
+; X86-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 7) nounwind
+  %c = icmp slt i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length8(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length8(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    br label [[LOADBB:%.*]]
+; X86:       res_block:
+; X86-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X86-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X86-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86:       loadbb:
+; X86-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86:       loadbb1:
+; X86-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86:       endblock:
+; X86-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-NEXT:    ret i32 [[PHI_RES]]
+;
+; X86-SSE1-LABEL: define i32 @length8(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE1:       res_block:
+; X86-SSE1-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X86-SSE1-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE1:       loadbb:
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE1-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86-SSE1:       loadbb1:
+; X86-SSE1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-SSE1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE1-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-SSE1-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-SSE1-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-SSE1-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86-SSE1:       endblock:
+; X86-SSE1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE1-NEXT:    ret i32 [[PHI_RES]]
+;
+; X86-SSE2-LABEL: define i32 @length8(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE2:       res_block:
+; X86-SSE2-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X86-SSE2-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE2:       loadbb:
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE2-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86-SSE2:       loadbb1:
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE2-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-SSE2-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-SSE2-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-SSE2-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86-SSE2:       endblock:
+; X86-SSE2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X86-SSE41-LABEL: define i32 @length8(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE41:       res_block:
+; X86-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X86-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE41:       loadbb:
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE41-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86-SSE41:       loadbb1:
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE41-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-SSE41-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-SSE41-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-SSE41-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86-SSE41:       endblock:
+; X86-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE41-NEXT:    ret i32 [[PHI_RES]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 8) nounwind
+  ret i32 %m
+}
+
+define i1 @length8_eq(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length8_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X86-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length8_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-SSE1-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-SSE1-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-SSE1-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X86-SSE1-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length8_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length8_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 8) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length8_eq_const(ptr %X) nounwind {
+; X86-LABEL: define i1 @length8_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = xor i32 [[TMP1]], 858927408
+; X86-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 1
+; X86-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP4]], 926299444
+; X86-NEXT:    [[TMP6:%.*]] = or i32 [[TMP2]], [[TMP5]]
+; X86-NEXT:    [[TMP7:%.*]] = icmp ne i32 [[TMP6]], 0
+; X86-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X86-NEXT:    ret i1 [[TMP7]]
+;
+; X86-SSE1-LABEL: define i1 @length8_eq_const(
+; X86-SSE1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = xor i32 [[TMP1]], 858927408
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 1
+; X86-SSE1-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP4]], 926299444
+; X86-SSE1-NEXT:    [[TMP6:%.*]] = or i32 [[TMP2]], [[TMP5]]
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = icmp ne i32 [[TMP6]], 0
+; X86-SSE1-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X86-SSE1-NEXT:    ret i1 [[TMP7]]
+;
+; X86-SSE2-LABEL: define i1 @length8_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = xor i32 [[TMP1]], 858927408
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 1
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP4]], 926299444
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = or i32 [[TMP2]], [[TMP5]]
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = icmp ne i32 [[TMP6]], 0
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X86-SSE2-NEXT:    ret i1 [[TMP7]]
+;
+; X86-SSE41-LABEL: define i1 @length8_eq_const(
+; X86-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = xor i32 [[TMP1]], 858927408
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 1
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP4]], 926299444
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = or i32 [[TMP2]], [[TMP5]]
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = icmp ne i32 [[TMP6]], 0
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X86-SSE41-NEXT:    ret i1 [[TMP7]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 8) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length9_eq(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length9_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X86-NEXT:    [[TMP12:%.*]] = load i8, ptr [[TMP10]], align 1
+; X86-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X86-NEXT:    [[TMP14:%.*]] = zext i8 [[TMP12]] to i32
+; X86-NEXT:    [[TMP15:%.*]] = xor i32 [[TMP13]], [[TMP14]]
+; X86-NEXT:    [[TMP16:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-NEXT:    [[TMP17:%.*]] = or i32 [[TMP16]], [[TMP15]]
+; X86-NEXT:    [[TMP18:%.*]] = icmp ne i32 [[TMP17]], 0
+; X86-NEXT:    [[TMP19:%.*]] = zext i1 [[TMP18]] to i32
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP19]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length9_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-SSE1-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-SSE1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE1-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-SSE1-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X86-SSE1-NEXT:    [[TMP12:%.*]] = load i8, ptr [[TMP10]], align 1
+; X86-SSE1-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X86-SSE1-NEXT:    [[TMP14:%.*]] = zext i8 [[TMP12]] to i32
+; X86-SSE1-NEXT:    [[TMP15:%.*]] = xor i32 [[TMP13]], [[TMP14]]
+; X86-SSE1-NEXT:    [[TMP16:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-SSE1-NEXT:    [[TMP17:%.*]] = or i32 [[TMP16]], [[TMP15]]
+; X86-SSE1-NEXT:    [[TMP18:%.*]] = icmp ne i32 [[TMP17]], 0
+; X86-SSE1-NEXT:    [[TMP19:%.*]] = zext i1 [[TMP18]] to i32
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP19]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length9_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X86-SSE2-NEXT:    [[TMP12:%.*]] = load i8, ptr [[TMP10]], align 1
+; X86-SSE2-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X86-SSE2-NEXT:    [[TMP14:%.*]] = zext i8 [[TMP12]] to i32
+; X86-SSE2-NEXT:    [[TMP15:%.*]] = xor i32 [[TMP13]], [[TMP14]]
+; X86-SSE2-NEXT:    [[TMP16:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-SSE2-NEXT:    [[TMP17:%.*]] = or i32 [[TMP16]], [[TMP15]]
+; X86-SSE2-NEXT:    [[TMP18:%.*]] = icmp ne i32 [[TMP17]], 0
+; X86-SSE2-NEXT:    [[TMP19:%.*]] = zext i1 [[TMP18]] to i32
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP19]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length9_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X86-SSE41-NEXT:    [[TMP12:%.*]] = load i8, ptr [[TMP10]], align 1
+; X86-SSE41-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X86-SSE41-NEXT:    [[TMP14:%.*]] = zext i8 [[TMP12]] to i32
+; X86-SSE41-NEXT:    [[TMP15:%.*]] = xor i32 [[TMP13]], [[TMP14]]
+; X86-SSE41-NEXT:    [[TMP16:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-SSE41-NEXT:    [[TMP17:%.*]] = or i32 [[TMP16]], [[TMP15]]
+; X86-SSE41-NEXT:    [[TMP18:%.*]] = icmp ne i32 [[TMP17]], 0
+; X86-SSE41-NEXT:    [[TMP19:%.*]] = zext i1 [[TMP18]] to i32
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP19]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 9) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length10_eq(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length10_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-NEXT:    [[TMP11:%.*]] = load i16, ptr [[TMP9]], align 1
+; X86-NEXT:    [[TMP12:%.*]] = load i16, ptr [[TMP10]], align 1
+; X86-NEXT:    [[TMP13:%.*]] = zext i16 [[TMP11]] to i32
+; X86-NEXT:    [[TMP14:%.*]] = zext i16 [[TMP12]] to i32
+; X86-NEXT:    [[TMP15:%.*]] = xor i32 [[TMP13]], [[TMP14]]
+; X86-NEXT:    [[TMP16:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-NEXT:    [[TMP17:%.*]] = or i32 [[TMP16]], [[TMP15]]
+; X86-NEXT:    [[TMP18:%.*]] = icmp ne i32 [[TMP17]], 0
+; X86-NEXT:    [[TMP19:%.*]] = zext i1 [[TMP18]] to i32
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP19]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length10_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-SSE1-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-SSE1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE1-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-SSE1-NEXT:    [[TMP11:%.*]] = load i16, ptr [[TMP9]], align 1
+; X86-SSE1-NEXT:    [[TMP12:%.*]] = load i16, ptr [[TMP10]], align 1
+; X86-SSE1-NEXT:    [[TMP13:%.*]] = zext i16 [[TMP11]] to i32
+; X86-SSE1-NEXT:    [[TMP14:%.*]] = zext i16 [[TMP12]] to i32
+; X86-SSE1-NEXT:    [[TMP15:%.*]] = xor i32 [[TMP13]], [[TMP14]]
+; X86-SSE1-NEXT:    [[TMP16:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-SSE1-NEXT:    [[TMP17:%.*]] = or i32 [[TMP16]], [[TMP15]]
+; X86-SSE1-NEXT:    [[TMP18:%.*]] = icmp ne i32 [[TMP17]], 0
+; X86-SSE1-NEXT:    [[TMP19:%.*]] = zext i1 [[TMP18]] to i32
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP19]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length10_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = load i16, ptr [[TMP9]], align 1
+; X86-SSE2-NEXT:    [[TMP12:%.*]] = load i16, ptr [[TMP10]], align 1
+; X86-SSE2-NEXT:    [[TMP13:%.*]] = zext i16 [[TMP11]] to i32
+; X86-SSE2-NEXT:    [[TMP14:%.*]] = zext i16 [[TMP12]] to i32
+; X86-SSE2-NEXT:    [[TMP15:%.*]] = xor i32 [[TMP13]], [[TMP14]]
+; X86-SSE2-NEXT:    [[TMP16:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-SSE2-NEXT:    [[TMP17:%.*]] = or i32 [[TMP16]], [[TMP15]]
+; X86-SSE2-NEXT:    [[TMP18:%.*]] = icmp ne i32 [[TMP17]], 0
+; X86-SSE2-NEXT:    [[TMP19:%.*]] = zext i1 [[TMP18]] to i32
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP19]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length10_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = load i16, ptr [[TMP9]], align 1
+; X86-SSE41-NEXT:    [[TMP12:%.*]] = load i16, ptr [[TMP10]], align 1
+; X86-SSE41-NEXT:    [[TMP13:%.*]] = zext i16 [[TMP11]] to i32
+; X86-SSE41-NEXT:    [[TMP14:%.*]] = zext i16 [[TMP12]] to i32
+; X86-SSE41-NEXT:    [[TMP15:%.*]] = xor i32 [[TMP13]], [[TMP14]]
+; X86-SSE41-NEXT:    [[TMP16:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-SSE41-NEXT:    [[TMP17:%.*]] = or i32 [[TMP16]], [[TMP15]]
+; X86-SSE41-NEXT:    [[TMP18:%.*]] = icmp ne i32 [[TMP17]], 0
+; X86-SSE41-NEXT:    [[TMP19:%.*]] = zext i1 [[TMP18]] to i32
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP19]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 10) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length11_eq(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length11_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X86-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X86-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP10]], align 1
+; X86-NEXT:    [[TMP13:%.*]] = xor i32 [[TMP11]], [[TMP12]]
+; X86-NEXT:    [[TMP14:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-NEXT:    [[TMP15:%.*]] = or i32 [[TMP14]], [[TMP13]]
+; X86-NEXT:    [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0
+; X86-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP17]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length11_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-SSE1-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-SSE1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X86-SSE1-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X86-SSE1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE1-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP10]], align 1
+; X86-SSE1-NEXT:    [[TMP13:%.*]] = xor i32 [[TMP11]], [[TMP12]]
+; X86-SSE1-NEXT:    [[TMP14:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-SSE1-NEXT:    [[TMP15:%.*]] = or i32 [[TMP14]], [[TMP13]]
+; X86-SSE1-NEXT:    [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0
+; X86-SSE1-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP17]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length11_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE2-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP10]], align 1
+; X86-SSE2-NEXT:    [[TMP13:%.*]] = xor i32 [[TMP11]], [[TMP12]]
+; X86-SSE2-NEXT:    [[TMP14:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-SSE2-NEXT:    [[TMP15:%.*]] = or i32 [[TMP14]], [[TMP13]]
+; X86-SSE2-NEXT:    [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0
+; X86-SSE2-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP17]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length11_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE41-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP10]], align 1
+; X86-SSE41-NEXT:    [[TMP13:%.*]] = xor i32 [[TMP11]], [[TMP12]]
+; X86-SSE41-NEXT:    [[TMP14:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-SSE41-NEXT:    [[TMP15:%.*]] = or i32 [[TMP14]], [[TMP13]]
+; X86-SSE41-NEXT:    [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0
+; X86-SSE41-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP17]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 11) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length12_eq(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length12_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP10]], align 1
+; X86-NEXT:    [[TMP13:%.*]] = xor i32 [[TMP11]], [[TMP12]]
+; X86-NEXT:    [[TMP14:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-NEXT:    [[TMP15:%.*]] = or i32 [[TMP14]], [[TMP13]]
+; X86-NEXT:    [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0
+; X86-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X86-NEXT:    ret i1 [[TMP16]]
+;
+; X86-SSE1-LABEL: define i1 @length12_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-SSE1-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-SSE1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE1-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-SSE1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE1-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP10]], align 1
+; X86-SSE1-NEXT:    [[TMP13:%.*]] = xor i32 [[TMP11]], [[TMP12]]
+; X86-SSE1-NEXT:    [[TMP14:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-SSE1-NEXT:    [[TMP15:%.*]] = or i32 [[TMP14]], [[TMP13]]
+; X86-SSE1-NEXT:    [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0
+; X86-SSE1-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X86-SSE1-NEXT:    ret i1 [[TMP16]]
+;
+; X86-SSE2-LABEL: define i1 @length12_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE2-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP10]], align 1
+; X86-SSE2-NEXT:    [[TMP13:%.*]] = xor i32 [[TMP11]], [[TMP12]]
+; X86-SSE2-NEXT:    [[TMP14:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-SSE2-NEXT:    [[TMP15:%.*]] = or i32 [[TMP14]], [[TMP13]]
+; X86-SSE2-NEXT:    [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0
+; X86-SSE2-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X86-SSE2-NEXT:    ret i1 [[TMP16]]
+;
+; X86-SSE41-LABEL: define i1 @length12_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE41-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP10]], align 1
+; X86-SSE41-NEXT:    [[TMP13:%.*]] = xor i32 [[TMP11]], [[TMP12]]
+; X86-SSE41-NEXT:    [[TMP14:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-SSE41-NEXT:    [[TMP15:%.*]] = or i32 [[TMP14]], [[TMP13]]
+; X86-SSE41-NEXT:    [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0
+; X86-SSE41-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X86-SSE41-NEXT:    ret i1 [[TMP16]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 12) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length12(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length12(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    br label [[LOADBB:%.*]]
+; X86:       res_block:
+; X86-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X86-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X86-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86:       loadbb:
+; X86-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86:       loadbb1:
+; X86-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X86:       loadbb2:
+; X86-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP15]], align 1
+; X86-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP16]], align 1
+; X86-NEXT:    [[TMP19]] = call i32 @llvm.bswap.i32(i32 [[TMP17]])
+; X86-NEXT:    [[TMP20]] = call i32 @llvm.bswap.i32(i32 [[TMP18]])
+; X86-NEXT:    [[TMP21:%.*]] = icmp eq i32 [[TMP19]], [[TMP20]]
+; X86-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86:       endblock:
+; X86-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-NEXT:    ret i32 [[PHI_RES]]
+;
+; X86-SSE1-LABEL: define i32 @length12(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE1:       res_block:
+; X86-SSE1-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X86-SSE1-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE1:       loadbb:
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE1-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86-SSE1:       loadbb1:
+; X86-SSE1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-SSE1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE1-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-SSE1-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-SSE1-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-SSE1-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X86-SSE1:       loadbb2:
+; X86-SSE1-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE1-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-SSE1-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP15]], align 1
+; X86-SSE1-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP16]], align 1
+; X86-SSE1-NEXT:    [[TMP19]] = call i32 @llvm.bswap.i32(i32 [[TMP17]])
+; X86-SSE1-NEXT:    [[TMP20]] = call i32 @llvm.bswap.i32(i32 [[TMP18]])
+; X86-SSE1-NEXT:    [[TMP21:%.*]] = icmp eq i32 [[TMP19]], [[TMP20]]
+; X86-SSE1-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86-SSE1:       endblock:
+; X86-SSE1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE1-NEXT:    ret i32 [[PHI_RES]]
+;
+; X86-SSE2-LABEL: define i32 @length12(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE2:       res_block:
+; X86-SSE2-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X86-SSE2-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE2:       loadbb:
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE2-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86-SSE2:       loadbb1:
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE2-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-SSE2-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-SSE2-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-SSE2-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X86-SSE2:       loadbb2:
+; X86-SSE2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE2-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-SSE2-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP15]], align 1
+; X86-SSE2-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP16]], align 1
+; X86-SSE2-NEXT:    [[TMP19]] = call i32 @llvm.bswap.i32(i32 [[TMP17]])
+; X86-SSE2-NEXT:    [[TMP20]] = call i32 @llvm.bswap.i32(i32 [[TMP18]])
+; X86-SSE2-NEXT:    [[TMP21:%.*]] = icmp eq i32 [[TMP19]], [[TMP20]]
+; X86-SSE2-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86-SSE2:       endblock:
+; X86-SSE2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X86-SSE41-LABEL: define i32 @length12(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE41:       res_block:
+; X86-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X86-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE41:       loadbb:
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE41-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86-SSE41:       loadbb1:
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE41-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-SSE41-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-SSE41-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-SSE41-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X86-SSE41:       loadbb2:
+; X86-SSE41-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE41-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-SSE41-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP15]], align 1
+; X86-SSE41-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP16]], align 1
+; X86-SSE41-NEXT:    [[TMP19]] = call i32 @llvm.bswap.i32(i32 [[TMP17]])
+; X86-SSE41-NEXT:    [[TMP20]] = call i32 @llvm.bswap.i32(i32 [[TMP18]])
+; X86-SSE41-NEXT:    [[TMP21:%.*]] = icmp eq i32 [[TMP19]], [[TMP20]]
+; X86-SSE41-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86-SSE41:       endblock:
+; X86-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE41-NEXT:    ret i32 [[PHI_RES]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 12) nounwind
+  ret i32 %m
+}
+
+define i1 @length13_eq(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length13_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP10]], align 1
+; X86-NEXT:    [[TMP13:%.*]] = xor i32 [[TMP11]], [[TMP12]]
+; X86-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 12
+; X86-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 12
+; X86-NEXT:    [[TMP16:%.*]] = load i8, ptr [[TMP14]], align 1
+; X86-NEXT:    [[TMP17:%.*]] = load i8, ptr [[TMP15]], align 1
+; X86-NEXT:    [[TMP18:%.*]] = zext i8 [[TMP16]] to i32
+; X86-NEXT:    [[TMP19:%.*]] = zext i8 [[TMP17]] to i32
+; X86-NEXT:    [[TMP20:%.*]] = xor i32 [[TMP18]], [[TMP19]]
+; X86-NEXT:    [[TMP21:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-NEXT:    [[TMP22:%.*]] = or i32 [[TMP13]], [[TMP20]]
+; X86-NEXT:    [[TMP23:%.*]] = or i32 [[TMP21]], [[TMP22]]
+; X86-NEXT:    [[TMP24:%.*]] = icmp ne i32 [[TMP23]], 0
+; X86-NEXT:    [[TMP25:%.*]] = zext i1 [[TMP24]] to i32
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP25]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length13_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-SSE1-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-SSE1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE1-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-SSE1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE1-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP10]], align 1
+; X86-SSE1-NEXT:    [[TMP13:%.*]] = xor i32 [[TMP11]], [[TMP12]]
+; X86-SSE1-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 12
+; X86-SSE1-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 12
+; X86-SSE1-NEXT:    [[TMP16:%.*]] = load i8, ptr [[TMP14]], align 1
+; X86-SSE1-NEXT:    [[TMP17:%.*]] = load i8, ptr [[TMP15]], align 1
+; X86-SSE1-NEXT:    [[TMP18:%.*]] = zext i8 [[TMP16]] to i32
+; X86-SSE1-NEXT:    [[TMP19:%.*]] = zext i8 [[TMP17]] to i32
+; X86-SSE1-NEXT:    [[TMP20:%.*]] = xor i32 [[TMP18]], [[TMP19]]
+; X86-SSE1-NEXT:    [[TMP21:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-SSE1-NEXT:    [[TMP22:%.*]] = or i32 [[TMP13]], [[TMP20]]
+; X86-SSE1-NEXT:    [[TMP23:%.*]] = or i32 [[TMP21]], [[TMP22]]
+; X86-SSE1-NEXT:    [[TMP24:%.*]] = icmp ne i32 [[TMP23]], 0
+; X86-SSE1-NEXT:    [[TMP25:%.*]] = zext i1 [[TMP24]] to i32
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP25]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length13_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE2-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP10]], align 1
+; X86-SSE2-NEXT:    [[TMP13:%.*]] = xor i32 [[TMP11]], [[TMP12]]
+; X86-SSE2-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 12
+; X86-SSE2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 12
+; X86-SSE2-NEXT:    [[TMP16:%.*]] = load i8, ptr [[TMP14]], align 1
+; X86-SSE2-NEXT:    [[TMP17:%.*]] = load i8, ptr [[TMP15]], align 1
+; X86-SSE2-NEXT:    [[TMP18:%.*]] = zext i8 [[TMP16]] to i32
+; X86-SSE2-NEXT:    [[TMP19:%.*]] = zext i8 [[TMP17]] to i32
+; X86-SSE2-NEXT:    [[TMP20:%.*]] = xor i32 [[TMP18]], [[TMP19]]
+; X86-SSE2-NEXT:    [[TMP21:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-SSE2-NEXT:    [[TMP22:%.*]] = or i32 [[TMP13]], [[TMP20]]
+; X86-SSE2-NEXT:    [[TMP23:%.*]] = or i32 [[TMP21]], [[TMP22]]
+; X86-SSE2-NEXT:    [[TMP24:%.*]] = icmp ne i32 [[TMP23]], 0
+; X86-SSE2-NEXT:    [[TMP25:%.*]] = zext i1 [[TMP24]] to i32
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP25]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length13_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE41-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP10]], align 1
+; X86-SSE41-NEXT:    [[TMP13:%.*]] = xor i32 [[TMP11]], [[TMP12]]
+; X86-SSE41-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 12
+; X86-SSE41-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 12
+; X86-SSE41-NEXT:    [[TMP16:%.*]] = load i8, ptr [[TMP14]], align 1
+; X86-SSE41-NEXT:    [[TMP17:%.*]] = load i8, ptr [[TMP15]], align 1
+; X86-SSE41-NEXT:    [[TMP18:%.*]] = zext i8 [[TMP16]] to i32
+; X86-SSE41-NEXT:    [[TMP19:%.*]] = zext i8 [[TMP17]] to i32
+; X86-SSE41-NEXT:    [[TMP20:%.*]] = xor i32 [[TMP18]], [[TMP19]]
+; X86-SSE41-NEXT:    [[TMP21:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-SSE41-NEXT:    [[TMP22:%.*]] = or i32 [[TMP13]], [[TMP20]]
+; X86-SSE41-NEXT:    [[TMP23:%.*]] = or i32 [[TMP21]], [[TMP22]]
+; X86-SSE41-NEXT:    [[TMP24:%.*]] = icmp ne i32 [[TMP23]], 0
+; X86-SSE41-NEXT:    [[TMP25:%.*]] = zext i1 [[TMP24]] to i32
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP25]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 13) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length14_eq(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length14_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP10]], align 1
+; X86-NEXT:    [[TMP13:%.*]] = xor i32 [[TMP11]], [[TMP12]]
+; X86-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 12
+; X86-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 12
+; X86-NEXT:    [[TMP16:%.*]] = load i16, ptr [[TMP14]], align 1
+; X86-NEXT:    [[TMP17:%.*]] = load i16, ptr [[TMP15]], align 1
+; X86-NEXT:    [[TMP18:%.*]] = zext i16 [[TMP16]] to i32
+; X86-NEXT:    [[TMP19:%.*]] = zext i16 [[TMP17]] to i32
+; X86-NEXT:    [[TMP20:%.*]] = xor i32 [[TMP18]], [[TMP19]]
+; X86-NEXT:    [[TMP21:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-NEXT:    [[TMP22:%.*]] = or i32 [[TMP13]], [[TMP20]]
+; X86-NEXT:    [[TMP23:%.*]] = or i32 [[TMP21]], [[TMP22]]
+; X86-NEXT:    [[TMP24:%.*]] = icmp ne i32 [[TMP23]], 0
+; X86-NEXT:    [[TMP25:%.*]] = zext i1 [[TMP24]] to i32
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP25]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length14_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-SSE1-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-SSE1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE1-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-SSE1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE1-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP10]], align 1
+; X86-SSE1-NEXT:    [[TMP13:%.*]] = xor i32 [[TMP11]], [[TMP12]]
+; X86-SSE1-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 12
+; X86-SSE1-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 12
+; X86-SSE1-NEXT:    [[TMP16:%.*]] = load i16, ptr [[TMP14]], align 1
+; X86-SSE1-NEXT:    [[TMP17:%.*]] = load i16, ptr [[TMP15]], align 1
+; X86-SSE1-NEXT:    [[TMP18:%.*]] = zext i16 [[TMP16]] to i32
+; X86-SSE1-NEXT:    [[TMP19:%.*]] = zext i16 [[TMP17]] to i32
+; X86-SSE1-NEXT:    [[TMP20:%.*]] = xor i32 [[TMP18]], [[TMP19]]
+; X86-SSE1-NEXT:    [[TMP21:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-SSE1-NEXT:    [[TMP22:%.*]] = or i32 [[TMP13]], [[TMP20]]
+; X86-SSE1-NEXT:    [[TMP23:%.*]] = or i32 [[TMP21]], [[TMP22]]
+; X86-SSE1-NEXT:    [[TMP24:%.*]] = icmp ne i32 [[TMP23]], 0
+; X86-SSE1-NEXT:    [[TMP25:%.*]] = zext i1 [[TMP24]] to i32
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP25]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length14_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE2-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP10]], align 1
+; X86-SSE2-NEXT:    [[TMP13:%.*]] = xor i32 [[TMP11]], [[TMP12]]
+; X86-SSE2-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 12
+; X86-SSE2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 12
+; X86-SSE2-NEXT:    [[TMP16:%.*]] = load i16, ptr [[TMP14]], align 1
+; X86-SSE2-NEXT:    [[TMP17:%.*]] = load i16, ptr [[TMP15]], align 1
+; X86-SSE2-NEXT:    [[TMP18:%.*]] = zext i16 [[TMP16]] to i32
+; X86-SSE2-NEXT:    [[TMP19:%.*]] = zext i16 [[TMP17]] to i32
+; X86-SSE2-NEXT:    [[TMP20:%.*]] = xor i32 [[TMP18]], [[TMP19]]
+; X86-SSE2-NEXT:    [[TMP21:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-SSE2-NEXT:    [[TMP22:%.*]] = or i32 [[TMP13]], [[TMP20]]
+; X86-SSE2-NEXT:    [[TMP23:%.*]] = or i32 [[TMP21]], [[TMP22]]
+; X86-SSE2-NEXT:    [[TMP24:%.*]] = icmp ne i32 [[TMP23]], 0
+; X86-SSE2-NEXT:    [[TMP25:%.*]] = zext i1 [[TMP24]] to i32
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP25]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length14_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE41-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP10]], align 1
+; X86-SSE41-NEXT:    [[TMP13:%.*]] = xor i32 [[TMP11]], [[TMP12]]
+; X86-SSE41-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 12
+; X86-SSE41-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 12
+; X86-SSE41-NEXT:    [[TMP16:%.*]] = load i16, ptr [[TMP14]], align 1
+; X86-SSE41-NEXT:    [[TMP17:%.*]] = load i16, ptr [[TMP15]], align 1
+; X86-SSE41-NEXT:    [[TMP18:%.*]] = zext i16 [[TMP16]] to i32
+; X86-SSE41-NEXT:    [[TMP19:%.*]] = zext i16 [[TMP17]] to i32
+; X86-SSE41-NEXT:    [[TMP20:%.*]] = xor i32 [[TMP18]], [[TMP19]]
+; X86-SSE41-NEXT:    [[TMP21:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-SSE41-NEXT:    [[TMP22:%.*]] = or i32 [[TMP13]], [[TMP20]]
+; X86-SSE41-NEXT:    [[TMP23:%.*]] = or i32 [[TMP21]], [[TMP22]]
+; X86-SSE41-NEXT:    [[TMP24:%.*]] = icmp ne i32 [[TMP23]], 0
+; X86-SSE41-NEXT:    [[TMP25:%.*]] = zext i1 [[TMP24]] to i32
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP25]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 14) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length15_eq(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length15_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP10]], align 1
+; X86-NEXT:    [[TMP13:%.*]] = xor i32 [[TMP11]], [[TMP12]]
+; X86-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 11
+; X86-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 11
+; X86-NEXT:    [[TMP16:%.*]] = load i32, ptr [[TMP14]], align 1
+; X86-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP15]], align 1
+; X86-NEXT:    [[TMP18:%.*]] = xor i32 [[TMP16]], [[TMP17]]
+; X86-NEXT:    [[TMP19:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-NEXT:    [[TMP20:%.*]] = or i32 [[TMP13]], [[TMP18]]
+; X86-NEXT:    [[TMP21:%.*]] = or i32 [[TMP19]], [[TMP20]]
+; X86-NEXT:    [[TMP22:%.*]] = icmp ne i32 [[TMP21]], 0
+; X86-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP23]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length15_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-SSE1-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-SSE1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE1-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-SSE1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE1-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP10]], align 1
+; X86-SSE1-NEXT:    [[TMP13:%.*]] = xor i32 [[TMP11]], [[TMP12]]
+; X86-SSE1-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 11
+; X86-SSE1-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 11
+; X86-SSE1-NEXT:    [[TMP16:%.*]] = load i32, ptr [[TMP14]], align 1
+; X86-SSE1-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP15]], align 1
+; X86-SSE1-NEXT:    [[TMP18:%.*]] = xor i32 [[TMP16]], [[TMP17]]
+; X86-SSE1-NEXT:    [[TMP19:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-SSE1-NEXT:    [[TMP20:%.*]] = or i32 [[TMP13]], [[TMP18]]
+; X86-SSE1-NEXT:    [[TMP21:%.*]] = or i32 [[TMP19]], [[TMP20]]
+; X86-SSE1-NEXT:    [[TMP22:%.*]] = icmp ne i32 [[TMP21]], 0
+; X86-SSE1-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP23]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length15_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE2-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP10]], align 1
+; X86-SSE2-NEXT:    [[TMP13:%.*]] = xor i32 [[TMP11]], [[TMP12]]
+; X86-SSE2-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 11
+; X86-SSE2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 11
+; X86-SSE2-NEXT:    [[TMP16:%.*]] = load i32, ptr [[TMP14]], align 1
+; X86-SSE2-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP15]], align 1
+; X86-SSE2-NEXT:    [[TMP18:%.*]] = xor i32 [[TMP16]], [[TMP17]]
+; X86-SSE2-NEXT:    [[TMP19:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-SSE2-NEXT:    [[TMP20:%.*]] = or i32 [[TMP13]], [[TMP18]]
+; X86-SSE2-NEXT:    [[TMP21:%.*]] = or i32 [[TMP19]], [[TMP20]]
+; X86-SSE2-NEXT:    [[TMP22:%.*]] = icmp ne i32 [[TMP21]], 0
+; X86-SSE2-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP23]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length15_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE41-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP10]], align 1
+; X86-SSE41-NEXT:    [[TMP13:%.*]] = xor i32 [[TMP11]], [[TMP12]]
+; X86-SSE41-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 11
+; X86-SSE41-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 11
+; X86-SSE41-NEXT:    [[TMP16:%.*]] = load i32, ptr [[TMP14]], align 1
+; X86-SSE41-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP15]], align 1
+; X86-SSE41-NEXT:    [[TMP18:%.*]] = xor i32 [[TMP16]], [[TMP17]]
+; X86-SSE41-NEXT:    [[TMP19:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-SSE41-NEXT:    [[TMP20:%.*]] = or i32 [[TMP13]], [[TMP18]]
+; X86-SSE41-NEXT:    [[TMP21:%.*]] = or i32 [[TMP19]], [[TMP20]]
+; X86-SSE41-NEXT:    [[TMP22:%.*]] = icmp ne i32 [[TMP21]], 0
+; X86-SSE41-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP23]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 15) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; PR33329 - https://bugs.llvm.org/show_bug.cgi?id=33329
+
+define i32 @length16(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length16(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    br label [[LOADBB:%.*]]
+; X86:       res_block:
+; X86-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X86-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X86-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86:       loadbb:
+; X86-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86:       loadbb1:
+; X86-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X86:       loadbb2:
+; X86-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP15]], align 1
+; X86-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP16]], align 1
+; X86-NEXT:    [[TMP19]] = call i32 @llvm.bswap.i32(i32 [[TMP17]])
+; X86-NEXT:    [[TMP20]] = call i32 @llvm.bswap.i32(i32 [[TMP18]])
+; X86-NEXT:    [[TMP21:%.*]] = icmp eq i32 [[TMP19]], [[TMP20]]
+; X86-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X86:       loadbb3:
+; X86-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 12
+; X86-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 12
+; X86-NEXT:    [[TMP24:%.*]] = load i32, ptr [[TMP22]], align 1
+; X86-NEXT:    [[TMP25:%.*]] = load i32, ptr [[TMP23]], align 1
+; X86-NEXT:    [[TMP26]] = call i32 @llvm.bswap.i32(i32 [[TMP24]])
+; X86-NEXT:    [[TMP27]] = call i32 @llvm.bswap.i32(i32 [[TMP25]])
+; X86-NEXT:    [[TMP28:%.*]] = icmp eq i32 [[TMP26]], [[TMP27]]
+; X86-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86:       endblock:
+; X86-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-NEXT:    ret i32 [[PHI_RES]]
+;
+; X86-SSE1-LABEL: define i32 @length16(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE1:       res_block:
+; X86-SSE1-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X86-SSE1-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE1:       loadbb:
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE1-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86-SSE1:       loadbb1:
+; X86-SSE1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-SSE1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE1-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-SSE1-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-SSE1-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-SSE1-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X86-SSE1:       loadbb2:
+; X86-SSE1-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE1-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-SSE1-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP15]], align 1
+; X86-SSE1-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP16]], align 1
+; X86-SSE1-NEXT:    [[TMP19]] = call i32 @llvm.bswap.i32(i32 [[TMP17]])
+; X86-SSE1-NEXT:    [[TMP20]] = call i32 @llvm.bswap.i32(i32 [[TMP18]])
+; X86-SSE1-NEXT:    [[TMP21:%.*]] = icmp eq i32 [[TMP19]], [[TMP20]]
+; X86-SSE1-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X86-SSE1:       loadbb3:
+; X86-SSE1-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 12
+; X86-SSE1-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 12
+; X86-SSE1-NEXT:    [[TMP24:%.*]] = load i32, ptr [[TMP22]], align 1
+; X86-SSE1-NEXT:    [[TMP25:%.*]] = load i32, ptr [[TMP23]], align 1
+; X86-SSE1-NEXT:    [[TMP26]] = call i32 @llvm.bswap.i32(i32 [[TMP24]])
+; X86-SSE1-NEXT:    [[TMP27]] = call i32 @llvm.bswap.i32(i32 [[TMP25]])
+; X86-SSE1-NEXT:    [[TMP28:%.*]] = icmp eq i32 [[TMP26]], [[TMP27]]
+; X86-SSE1-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86-SSE1:       endblock:
+; X86-SSE1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE1-NEXT:    ret i32 [[PHI_RES]]
+;
+; X86-SSE2-LABEL: define i32 @length16(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE2:       res_block:
+; X86-SSE2-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X86-SSE2-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE2:       loadbb:
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE2-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86-SSE2:       loadbb1:
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE2-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-SSE2-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-SSE2-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-SSE2-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X86-SSE2:       loadbb2:
+; X86-SSE2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE2-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-SSE2-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP15]], align 1
+; X86-SSE2-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP16]], align 1
+; X86-SSE2-NEXT:    [[TMP19]] = call i32 @llvm.bswap.i32(i32 [[TMP17]])
+; X86-SSE2-NEXT:    [[TMP20]] = call i32 @llvm.bswap.i32(i32 [[TMP18]])
+; X86-SSE2-NEXT:    [[TMP21:%.*]] = icmp eq i32 [[TMP19]], [[TMP20]]
+; X86-SSE2-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X86-SSE2:       loadbb3:
+; X86-SSE2-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 12
+; X86-SSE2-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 12
+; X86-SSE2-NEXT:    [[TMP24:%.*]] = load i32, ptr [[TMP22]], align 1
+; X86-SSE2-NEXT:    [[TMP25:%.*]] = load i32, ptr [[TMP23]], align 1
+; X86-SSE2-NEXT:    [[TMP26]] = call i32 @llvm.bswap.i32(i32 [[TMP24]])
+; X86-SSE2-NEXT:    [[TMP27]] = call i32 @llvm.bswap.i32(i32 [[TMP25]])
+; X86-SSE2-NEXT:    [[TMP28:%.*]] = icmp eq i32 [[TMP26]], [[TMP27]]
+; X86-SSE2-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86-SSE2:       endblock:
+; X86-SSE2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X86-SSE41-LABEL: define i32 @length16(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE41:       res_block:
+; X86-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X86-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE41:       loadbb:
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE41-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86-SSE41:       loadbb1:
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE41-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-SSE41-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-SSE41-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-SSE41-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X86-SSE41:       loadbb2:
+; X86-SSE41-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE41-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-SSE41-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP15]], align 1
+; X86-SSE41-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP16]], align 1
+; X86-SSE41-NEXT:    [[TMP19]] = call i32 @llvm.bswap.i32(i32 [[TMP17]])
+; X86-SSE41-NEXT:    [[TMP20]] = call i32 @llvm.bswap.i32(i32 [[TMP18]])
+; X86-SSE41-NEXT:    [[TMP21:%.*]] = icmp eq i32 [[TMP19]], [[TMP20]]
+; X86-SSE41-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X86-SSE41:       loadbb3:
+; X86-SSE41-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 12
+; X86-SSE41-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 12
+; X86-SSE41-NEXT:    [[TMP24:%.*]] = load i32, ptr [[TMP22]], align 1
+; X86-SSE41-NEXT:    [[TMP25:%.*]] = load i32, ptr [[TMP23]], align 1
+; X86-SSE41-NEXT:    [[TMP26]] = call i32 @llvm.bswap.i32(i32 [[TMP24]])
+; X86-SSE41-NEXT:    [[TMP27]] = call i32 @llvm.bswap.i32(i32 [[TMP25]])
+; X86-SSE41-NEXT:    [[TMP28:%.*]] = icmp eq i32 [[TMP26]], [[TMP27]]
+; X86-SSE41-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86-SSE41:       endblock:
+; X86-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE41-NEXT:    ret i32 [[PHI_RES]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 16) nounwind
+  ret i32 %m
+}
+
+define i1 @length16_eq(ptr %x, ptr %y) nounwind {
+; X86-NOSSE-LABEL: length16_eq:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl %esi
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NOSSE-NEXT:    movl (%edx), %esi
+; X86-NOSSE-NEXT:    movl 4(%edx), %eax
+; X86-NOSSE-NEXT:    xorl (%ecx), %esi
+; X86-NOSSE-NEXT:    xorl 4(%ecx), %eax
+; X86-NOSSE-NEXT:    orl %esi, %eax
+; X86-NOSSE-NEXT:    movl 8(%edx), %esi
+; X86-NOSSE-NEXT:    xorl 8(%ecx), %esi
+; X86-NOSSE-NEXT:    movl 12(%edx), %edx
+; X86-NOSSE-NEXT:    xorl 12(%ecx), %edx
+; X86-NOSSE-NEXT:    orl %esi, %edx
+; X86-NOSSE-NEXT:    orl %eax, %edx
+; X86-NOSSE-NEXT:    setne %al
+; X86-NOSSE-NEXT:    popl %esi
+; X86-NOSSE-NEXT:    retl
+;
+; X86-LABEL: define i1 @length16_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP10]], align 1
+; X86-NEXT:    [[TMP13:%.*]] = xor i32 [[TMP11]], [[TMP12]]
+; X86-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 12
+; X86-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 12
+; X86-NEXT:    [[TMP16:%.*]] = load i32, ptr [[TMP14]], align 1
+; X86-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP15]], align 1
+; X86-NEXT:    [[TMP18:%.*]] = xor i32 [[TMP16]], [[TMP17]]
+; X86-NEXT:    [[TMP19:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-NEXT:    [[TMP20:%.*]] = or i32 [[TMP13]], [[TMP18]]
+; X86-NEXT:    [[TMP21:%.*]] = or i32 [[TMP19]], [[TMP20]]
+; X86-NEXT:    [[TMP22:%.*]] = icmp ne i32 [[TMP21]], 0
+; X86-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X86-NEXT:    ret i1 [[TMP22]]
+;
+; X86-SSE1-LABEL: define i1 @length16_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-SSE1-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-SSE1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE1-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-SSE1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE1-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP10]], align 1
+; X86-SSE1-NEXT:    [[TMP13:%.*]] = xor i32 [[TMP11]], [[TMP12]]
+; X86-SSE1-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 12
+; X86-SSE1-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 12
+; X86-SSE1-NEXT:    [[TMP16:%.*]] = load i32, ptr [[TMP14]], align 1
+; X86-SSE1-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP15]], align 1
+; X86-SSE1-NEXT:    [[TMP18:%.*]] = xor i32 [[TMP16]], [[TMP17]]
+; X86-SSE1-NEXT:    [[TMP19:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-SSE1-NEXT:    [[TMP20:%.*]] = or i32 [[TMP13]], [[TMP18]]
+; X86-SSE1-NEXT:    [[TMP21:%.*]] = or i32 [[TMP19]], [[TMP20]]
+; X86-SSE1-NEXT:    [[TMP22:%.*]] = icmp ne i32 [[TMP21]], 0
+; X86-SSE1-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X86-SSE1-NEXT:    ret i1 [[TMP22]]
+;
+; X86-SSE2-LABEL: define i1 @length16_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = icmp ne i128 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X86-SSE2-NEXT:    ret i1 [[TMP3]]
+;
+; X86-SSE41-LABEL: define i1 @length16_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = icmp ne i128 [[TMP1]], [[TMP2]]
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X86-SSE41-NEXT:    ret i1 [[TMP3]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 16) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length16_lt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length16_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    br label [[LOADBB:%.*]]
+; X86:       res_block:
+; X86-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X86-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X86-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86:       loadbb:
+; X86-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86:       loadbb1:
+; X86-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X86:       loadbb2:
+; X86-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP15]], align 1
+; X86-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP16]], align 1
+; X86-NEXT:    [[TMP19]] = call i32 @llvm.bswap.i32(i32 [[TMP17]])
+; X86-NEXT:    [[TMP20]] = call i32 @llvm.bswap.i32(i32 [[TMP18]])
+; X86-NEXT:    [[TMP21:%.*]] = icmp eq i32 [[TMP19]], [[TMP20]]
+; X86-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X86:       loadbb3:
+; X86-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 12
+; X86-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 12
+; X86-NEXT:    [[TMP24:%.*]] = load i32, ptr [[TMP22]], align 1
+; X86-NEXT:    [[TMP25:%.*]] = load i32, ptr [[TMP23]], align 1
+; X86-NEXT:    [[TMP26]] = call i32 @llvm.bswap.i32(i32 [[TMP24]])
+; X86-NEXT:    [[TMP27]] = call i32 @llvm.bswap.i32(i32 [[TMP25]])
+; X86-NEXT:    [[TMP28:%.*]] = icmp eq i32 [[TMP26]], [[TMP27]]
+; X86-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86:       endblock:
+; X86-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length16_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE1:       res_block:
+; X86-SSE1-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X86-SSE1-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE1:       loadbb:
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE1-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86-SSE1:       loadbb1:
+; X86-SSE1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-SSE1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE1-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-SSE1-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-SSE1-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-SSE1-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X86-SSE1:       loadbb2:
+; X86-SSE1-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE1-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-SSE1-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP15]], align 1
+; X86-SSE1-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP16]], align 1
+; X86-SSE1-NEXT:    [[TMP19]] = call i32 @llvm.bswap.i32(i32 [[TMP17]])
+; X86-SSE1-NEXT:    [[TMP20]] = call i32 @llvm.bswap.i32(i32 [[TMP18]])
+; X86-SSE1-NEXT:    [[TMP21:%.*]] = icmp eq i32 [[TMP19]], [[TMP20]]
+; X86-SSE1-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X86-SSE1:       loadbb3:
+; X86-SSE1-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 12
+; X86-SSE1-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 12
+; X86-SSE1-NEXT:    [[TMP24:%.*]] = load i32, ptr [[TMP22]], align 1
+; X86-SSE1-NEXT:    [[TMP25:%.*]] = load i32, ptr [[TMP23]], align 1
+; X86-SSE1-NEXT:    [[TMP26]] = call i32 @llvm.bswap.i32(i32 [[TMP24]])
+; X86-SSE1-NEXT:    [[TMP27]] = call i32 @llvm.bswap.i32(i32 [[TMP25]])
+; X86-SSE1-NEXT:    [[TMP28:%.*]] = icmp eq i32 [[TMP26]], [[TMP27]]
+; X86-SSE1-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86-SSE1:       endblock:
+; X86-SSE1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length16_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE2:       res_block:
+; X86-SSE2-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X86-SSE2-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE2:       loadbb:
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE2-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86-SSE2:       loadbb1:
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE2-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-SSE2-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-SSE2-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-SSE2-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X86-SSE2:       loadbb2:
+; X86-SSE2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE2-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-SSE2-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP15]], align 1
+; X86-SSE2-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP16]], align 1
+; X86-SSE2-NEXT:    [[TMP19]] = call i32 @llvm.bswap.i32(i32 [[TMP17]])
+; X86-SSE2-NEXT:    [[TMP20]] = call i32 @llvm.bswap.i32(i32 [[TMP18]])
+; X86-SSE2-NEXT:    [[TMP21:%.*]] = icmp eq i32 [[TMP19]], [[TMP20]]
+; X86-SSE2-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X86-SSE2:       loadbb3:
+; X86-SSE2-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 12
+; X86-SSE2-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 12
+; X86-SSE2-NEXT:    [[TMP24:%.*]] = load i32, ptr [[TMP22]], align 1
+; X86-SSE2-NEXT:    [[TMP25:%.*]] = load i32, ptr [[TMP23]], align 1
+; X86-SSE2-NEXT:    [[TMP26]] = call i32 @llvm.bswap.i32(i32 [[TMP24]])
+; X86-SSE2-NEXT:    [[TMP27]] = call i32 @llvm.bswap.i32(i32 [[TMP25]])
+; X86-SSE2-NEXT:    [[TMP28:%.*]] = icmp eq i32 [[TMP26]], [[TMP27]]
+; X86-SSE2-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86-SSE2:       endblock:
+; X86-SSE2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length16_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE41:       res_block:
+; X86-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X86-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE41:       loadbb:
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE41-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86-SSE41:       loadbb1:
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE41-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-SSE41-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-SSE41-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-SSE41-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X86-SSE41:       loadbb2:
+; X86-SSE41-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE41-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-SSE41-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP15]], align 1
+; X86-SSE41-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP16]], align 1
+; X86-SSE41-NEXT:    [[TMP19]] = call i32 @llvm.bswap.i32(i32 [[TMP17]])
+; X86-SSE41-NEXT:    [[TMP20]] = call i32 @llvm.bswap.i32(i32 [[TMP18]])
+; X86-SSE41-NEXT:    [[TMP21:%.*]] = icmp eq i32 [[TMP19]], [[TMP20]]
+; X86-SSE41-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X86-SSE41:       loadbb3:
+; X86-SSE41-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 12
+; X86-SSE41-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 12
+; X86-SSE41-NEXT:    [[TMP24:%.*]] = load i32, ptr [[TMP22]], align 1
+; X86-SSE41-NEXT:    [[TMP25:%.*]] = load i32, ptr [[TMP23]], align 1
+; X86-SSE41-NEXT:    [[TMP26]] = call i32 @llvm.bswap.i32(i32 [[TMP24]])
+; X86-SSE41-NEXT:    [[TMP27]] = call i32 @llvm.bswap.i32(i32 [[TMP25]])
+; X86-SSE41-NEXT:    [[TMP28:%.*]] = icmp eq i32 [[TMP26]], [[TMP27]]
+; X86-SSE41-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86-SSE41:       endblock:
+; X86-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 16) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length16_gt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length16_gt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    br label [[LOADBB:%.*]]
+; X86:       res_block:
+; X86-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X86-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X86-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86:       loadbb:
+; X86-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86:       loadbb1:
+; X86-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X86:       loadbb2:
+; X86-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP15]], align 1
+; X86-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP16]], align 1
+; X86-NEXT:    [[TMP19]] = call i32 @llvm.bswap.i32(i32 [[TMP17]])
+; X86-NEXT:    [[TMP20]] = call i32 @llvm.bswap.i32(i32 [[TMP18]])
+; X86-NEXT:    [[TMP21:%.*]] = icmp eq i32 [[TMP19]], [[TMP20]]
+; X86-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X86:       loadbb3:
+; X86-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 12
+; X86-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 12
+; X86-NEXT:    [[TMP24:%.*]] = load i32, ptr [[TMP22]], align 1
+; X86-NEXT:    [[TMP25:%.*]] = load i32, ptr [[TMP23]], align 1
+; X86-NEXT:    [[TMP26]] = call i32 @llvm.bswap.i32(i32 [[TMP24]])
+; X86-NEXT:    [[TMP27]] = call i32 @llvm.bswap.i32(i32 [[TMP25]])
+; X86-NEXT:    [[TMP28:%.*]] = icmp eq i32 [[TMP26]], [[TMP27]]
+; X86-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86:       endblock:
+; X86-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length16_gt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE1:       res_block:
+; X86-SSE1-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X86-SSE1-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE1:       loadbb:
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE1-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86-SSE1:       loadbb1:
+; X86-SSE1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-SSE1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE1-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-SSE1-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-SSE1-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-SSE1-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X86-SSE1:       loadbb2:
+; X86-SSE1-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE1-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-SSE1-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP15]], align 1
+; X86-SSE1-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP16]], align 1
+; X86-SSE1-NEXT:    [[TMP19]] = call i32 @llvm.bswap.i32(i32 [[TMP17]])
+; X86-SSE1-NEXT:    [[TMP20]] = call i32 @llvm.bswap.i32(i32 [[TMP18]])
+; X86-SSE1-NEXT:    [[TMP21:%.*]] = icmp eq i32 [[TMP19]], [[TMP20]]
+; X86-SSE1-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X86-SSE1:       loadbb3:
+; X86-SSE1-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 12
+; X86-SSE1-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 12
+; X86-SSE1-NEXT:    [[TMP24:%.*]] = load i32, ptr [[TMP22]], align 1
+; X86-SSE1-NEXT:    [[TMP25:%.*]] = load i32, ptr [[TMP23]], align 1
+; X86-SSE1-NEXT:    [[TMP26]] = call i32 @llvm.bswap.i32(i32 [[TMP24]])
+; X86-SSE1-NEXT:    [[TMP27]] = call i32 @llvm.bswap.i32(i32 [[TMP25]])
+; X86-SSE1-NEXT:    [[TMP28:%.*]] = icmp eq i32 [[TMP26]], [[TMP27]]
+; X86-SSE1-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86-SSE1:       endblock:
+; X86-SSE1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length16_gt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE2:       res_block:
+; X86-SSE2-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X86-SSE2-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE2:       loadbb:
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE2-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86-SSE2:       loadbb1:
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE2-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-SSE2-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-SSE2-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-SSE2-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X86-SSE2:       loadbb2:
+; X86-SSE2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE2-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-SSE2-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP15]], align 1
+; X86-SSE2-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP16]], align 1
+; X86-SSE2-NEXT:    [[TMP19]] = call i32 @llvm.bswap.i32(i32 [[TMP17]])
+; X86-SSE2-NEXT:    [[TMP20]] = call i32 @llvm.bswap.i32(i32 [[TMP18]])
+; X86-SSE2-NEXT:    [[TMP21:%.*]] = icmp eq i32 [[TMP19]], [[TMP20]]
+; X86-SSE2-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X86-SSE2:       loadbb3:
+; X86-SSE2-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 12
+; X86-SSE2-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 12
+; X86-SSE2-NEXT:    [[TMP24:%.*]] = load i32, ptr [[TMP22]], align 1
+; X86-SSE2-NEXT:    [[TMP25:%.*]] = load i32, ptr [[TMP23]], align 1
+; X86-SSE2-NEXT:    [[TMP26]] = call i32 @llvm.bswap.i32(i32 [[TMP24]])
+; X86-SSE2-NEXT:    [[TMP27]] = call i32 @llvm.bswap.i32(i32 [[TMP25]])
+; X86-SSE2-NEXT:    [[TMP28:%.*]] = icmp eq i32 [[TMP26]], [[TMP27]]
+; X86-SSE2-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86-SSE2:       endblock:
+; X86-SSE2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length16_gt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE41:       res_block:
+; X86-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X86-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE41:       loadbb:
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE41-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86-SSE41:       loadbb1:
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE41-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-SSE41-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-SSE41-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-SSE41-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X86-SSE41:       loadbb2:
+; X86-SSE41-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE41-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-SSE41-NEXT:    [[TMP17:%.*]] = load i32, ptr [[TMP15]], align 1
+; X86-SSE41-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP16]], align 1
+; X86-SSE41-NEXT:    [[TMP19]] = call i32 @llvm.bswap.i32(i32 [[TMP17]])
+; X86-SSE41-NEXT:    [[TMP20]] = call i32 @llvm.bswap.i32(i32 [[TMP18]])
+; X86-SSE41-NEXT:    [[TMP21:%.*]] = icmp eq i32 [[TMP19]], [[TMP20]]
+; X86-SSE41-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X86-SSE41:       loadbb3:
+; X86-SSE41-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 12
+; X86-SSE41-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 12
+; X86-SSE41-NEXT:    [[TMP24:%.*]] = load i32, ptr [[TMP22]], align 1
+; X86-SSE41-NEXT:    [[TMP25:%.*]] = load i32, ptr [[TMP23]], align 1
+; X86-SSE41-NEXT:    [[TMP26]] = call i32 @llvm.bswap.i32(i32 [[TMP24]])
+; X86-SSE41-NEXT:    [[TMP27]] = call i32 @llvm.bswap.i32(i32 [[TMP25]])
+; X86-SSE41-NEXT:    [[TMP28:%.*]] = icmp eq i32 [[TMP26]], [[TMP27]]
+; X86-SSE41-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86-SSE41:       endblock:
+; X86-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 16) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length16_eq_const(ptr %X) nounwind {
+; X86-NOSSE-LABEL: length16_eq_const:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl %esi
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOSSE-NEXT:    movl $858927408, %ecx # imm = 0x33323130
+; X86-NOSSE-NEXT:    xorl (%eax), %ecx
+; X86-NOSSE-NEXT:    movl $926299444, %edx # imm = 0x37363534
+; X86-NOSSE-NEXT:    xorl 4(%eax), %edx
+; X86-NOSSE-NEXT:    orl %ecx, %edx
+; X86-NOSSE-NEXT:    movl $825243960, %ecx # imm = 0x31303938
+; X86-NOSSE-NEXT:    xorl 8(%eax), %ecx
+; X86-NOSSE-NEXT:    movl $892613426, %esi # imm = 0x35343332
+; X86-NOSSE-NEXT:    xorl 12(%eax), %esi
+; X86-NOSSE-NEXT:    orl %ecx, %esi
+; X86-NOSSE-NEXT:    orl %edx, %esi
+; X86-NOSSE-NEXT:    sete %al
+; X86-NOSSE-NEXT:    popl %esi
+; X86-NOSSE-NEXT:    retl
+; NOTE(review): the X86-NOSSE lines above check assembly while every other prefix in this function checks IR; presumably stale llc-era checks -- confirm against the RUN lines.
+; X86-LABEL: define i1 @length16_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = xor i32 [[TMP1]], 858927408
+; X86-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 1
+; X86-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP4]], 926299444
+; X86-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 1
+; X86-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP7]], 825243960
+; X86-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 12
+; X86-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-NEXT:    [[TMP11:%.*]] = xor i32 [[TMP10]], 892613426
+; X86-NEXT:    [[TMP12:%.*]] = or i32 [[TMP2]], [[TMP5]]
+; X86-NEXT:    [[TMP13:%.*]] = or i32 [[TMP8]], [[TMP11]]
+; X86-NEXT:    [[TMP14:%.*]] = or i32 [[TMP12]], [[TMP13]]
+; X86-NEXT:    [[TMP15:%.*]] = icmp ne i32 [[TMP14]], 0
+; X86-NEXT:    [[TMP16:%.*]] = zext i1 [[TMP15]] to i32
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP16]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length16_eq_const(
+; X86-SSE1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = xor i32 [[TMP1]], 858927408
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 1
+; X86-SSE1-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP4]], 926299444
+; X86-SSE1-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 1
+; X86-SSE1-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP7]], 825243960
+; X86-SSE1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 12
+; X86-SSE1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE1-NEXT:    [[TMP11:%.*]] = xor i32 [[TMP10]], 892613426
+; X86-SSE1-NEXT:    [[TMP12:%.*]] = or i32 [[TMP2]], [[TMP5]]
+; X86-SSE1-NEXT:    [[TMP13:%.*]] = or i32 [[TMP8]], [[TMP11]]
+; X86-SSE1-NEXT:    [[TMP14:%.*]] = or i32 [[TMP12]], [[TMP13]]
+; X86-SSE1-NEXT:    [[TMP15:%.*]] = icmp ne i32 [[TMP14]], 0
+; X86-SSE1-NEXT:    [[TMP16:%.*]] = zext i1 [[TMP15]] to i32
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP16]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length16_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = icmp ne i128 [[TMP1]], 70720121592765328381466889075544961328
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length16_eq_const(
+; X86-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = icmp ne i128 [[TMP1]], 70720121592765328381466889075544961328
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+; Input: 16-byte memcmp of %X against @.str, tested for equality with 0. Expected: four i32 load/xor pairs or-ed together for X86 and X86-SSE1, a single i128 load and compare for X86-SSE2 and X86-SSE41.
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 16) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; PR33914 - https://bugs.llvm.org/show_bug.cgi?id=33914
+
+define i32 @length24(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length24(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 24) #[[ATTR5:[0-9]+]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE1-LABEL: define i32 @length24(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 24) #[[ATTR5:[0-9]+]]
+; X86-SSE1-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length24(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 24) #[[ATTR5:[0-9]+]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+; X86-SSE41-LABEL: define i32 @length24(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 24) #[[ATTR5:[0-9]+]]
+; X86-SSE41-NEXT:    ret i32 [[M]]
+; Input: 24-byte memcmp whose i32 result is returned directly. All four prefixes expect the call to be left in place.
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 24) nounwind
+  ret i32 %m
+}
+
+define i1 @length24_eq(ptr %x, ptr %y) nounwind {
+; X86-NOSSE-LABEL: length24_eq:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl $24
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    calll memcmp
+; X86-NOSSE-NEXT:    addl $12, %esp
+; X86-NOSSE-NEXT:    testl %eax, %eax
+; X86-NOSSE-NEXT:    sete %al
+; X86-NOSSE-NEXT:    retl
+; NOTE(review): the X86-NOSSE lines above check assembly while every other prefix in this function checks IR; presumably stale llc-era checks -- confirm against the RUN lines.
+; X86-LABEL: define i1 @length24_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 24) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length24_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 24) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length24_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length24_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+; Input: 24-byte memcmp tested for equality with 0. X86 and X86-SSE1 expect the call to remain; X86-SSE2 and X86-SSE41 expect two i128 load pairs at byte offsets 0 and 8, xor-ed and or-ed together.
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 24) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length24_lt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length24_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 24) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length24_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 24) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length24_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 24) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length24_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 24) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+; Input: 24-byte memcmp whose result is tested with slt 0. All four prefixes expect the call to be left in place.
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 24) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length24_gt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length24_gt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 24) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length24_gt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 24) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length24_gt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 24) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length24_gt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 24) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+; Input: 24-byte memcmp whose result is tested with sgt 0. All four prefixes expect the call to be left in place.
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 24) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length24_eq_const(ptr %X) nounwind {
+; X86-NOSSE-LABEL: length24_eq_const:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl $24
+; X86-NOSSE-NEXT:    pushl $.L.str
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    calll memcmp
+; X86-NOSSE-NEXT:    addl $12, %esp
+; X86-NOSSE-NEXT:    testl %eax, %eax
+; X86-NOSSE-NEXT:    setne %al
+; X86-NOSSE-NEXT:    retl
+; NOTE(review): the X86-NOSSE lines above check assembly while every other prefix in this function checks IR; presumably stale llc-era checks -- confirm against the RUN lines.
+; X86-LABEL: define i1 @length24_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 24) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length24_eq_const(
+; X86-SSE1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 24) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length24_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 68051240286688436651889234231545575736
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X86-SSE2-NEXT:    ret i1 [[TMP7]]
+;
+; X86-SSE41-LABEL: define i1 @length24_eq_const(
+; X86-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 68051240286688436651889234231545575736
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X86-SSE41-NEXT:    ret i1 [[TMP7]]
+; Input: 24-byte memcmp of %X against @.str, tested with ne 0. X86 and X86-SSE1 expect the call to remain; X86-SSE2 and X86-SSE41 expect two i128 loads at byte offsets 0 and 8 xor-ed with constants, returning the icmp ne directly (the zext TMP8 is left unused).
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 24) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length31(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length31(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 31) #[[ATTR5]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE1-LABEL: define i32 @length31(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 31) #[[ATTR5]]
+; X86-SSE1-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length31(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 31) #[[ATTR5]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+; X86-SSE41-LABEL: define i32 @length31(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 31) #[[ATTR5]]
+; X86-SSE41-NEXT:    ret i32 [[M]]
+; Input: 31-byte memcmp whose i32 result is returned directly. All four prefixes expect the call to be left in place.
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 31) nounwind
+  ret i32 %m
+}
+
+define i1 @length31_eq(ptr %x, ptr %y) nounwind {
+; X86-NOSSE-LABEL: length31_eq:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl $31
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    calll memcmp
+; X86-NOSSE-NEXT:    addl $12, %esp
+; X86-NOSSE-NEXT:    testl %eax, %eax
+; X86-NOSSE-NEXT:    sete %al
+; X86-NOSSE-NEXT:    retl
+; NOTE(review): the X86-NOSSE lines above check assembly while every other prefix in this function checks IR; presumably stale llc-era checks -- confirm against the RUN lines.
+; X86-LABEL: define i1 @length31_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 31) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length31_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 31) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length31_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length31_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+; Input: 31-byte memcmp tested for equality with 0. X86 and X86-SSE1 expect the call to remain; X86-SSE2 and X86-SSE41 expect two i128 load pairs at byte offsets 0 and 15, xor-ed and or-ed together.
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 31) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length31_lt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length31_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 31) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length31_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 31) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length31_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 31) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length31_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 31) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+; Input: 31-byte memcmp whose result is tested with slt 0. All four prefixes expect the call to be left in place.
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 31) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length31_gt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length31_gt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 31) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length31_gt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 31) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length31_gt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 31) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length31_gt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 31) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+; Input: 31-byte memcmp whose result is tested with sgt 0. All four prefixes expect the call to be left in place.
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 31) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length31_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"="128" {
+; X86-NOSSE-LABEL: length31_eq_prefer128:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl $31
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    calll memcmp
+; X86-NOSSE-NEXT:    addl $12, %esp
+; X86-NOSSE-NEXT:    testl %eax, %eax
+; X86-NOSSE-NEXT:    sete %al
+; X86-NOSSE-NEXT:    retl
+; NOTE(review): the X86-NOSSE lines above check assembly while every other prefix in this function checks IR; presumably stale llc-era checks -- confirm against the RUN lines.
+; X86-LABEL: define i1 @length31_eq_prefer128(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2:[0-9]+]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 31) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length31_eq_prefer128(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2:[0-9]+]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 31) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length31_eq_prefer128(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2:[0-9]+]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length31_eq_prefer128(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2:[0-9]+]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+; Input: 31-byte memcmp tested for equality with 0, in a function carrying the "prefer-vector-width"="128" attribute. X86 and X86-SSE1 expect the call to remain; X86-SSE2 and X86-SSE41 expect two i128 load pairs at byte offsets 0 and 15, xor-ed and or-ed together.
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 31) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length31_eq_const(ptr %X) nounwind {
+; X86-NOSSE-LABEL: length31_eq_const:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl $31
+; X86-NOSSE-NEXT:    pushl $.L.str
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    calll memcmp
+; X86-NOSSE-NEXT:    addl $12, %esp
+; X86-NOSSE-NEXT:    testl %eax, %eax
+; X86-NOSSE-NEXT:    setne %al
+; X86-NOSSE-NEXT:    retl
+;
+; X86-LABEL: define i1 @length31_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 31) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length31_eq_const(
+; X86-SSE1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 31) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length31_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 64100044907875699958541276911416849973
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X86-SSE2-NEXT:    ret i1 [[TMP7]]
+;
+; X86-SSE41-LABEL: define i1 @length31_eq_const(
+; X86-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 64100044907875699958541276911416849973
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X86-SSE41-NEXT:    ret i1 [[TMP7]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 31) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length32(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length32(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 32) #[[ATTR5]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE1-LABEL: define i32 @length32(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 32) #[[ATTR5]]
+; X86-SSE1-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length32(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 32) #[[ATTR5]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+; X86-SSE41-LABEL: define i32 @length32(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 32) #[[ATTR5]]
+; X86-SSE41-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 32) nounwind
+  ret i32 %m
+}
+
+; PR33325 - https://bugs.llvm.org/show_bug.cgi?id=33325
+
+define i1 @length32_eq(ptr %x, ptr %y) nounwind {
+; X86-NOSSE-LABEL: length32_eq:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl $32
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    calll memcmp
+; X86-NOSSE-NEXT:    addl $12, %esp
+; X86-NOSSE-NEXT:    testl %eax, %eax
+; X86-NOSSE-NEXT:    sete %al
+; X86-NOSSE-NEXT:    retl
+;
+; X86-LABEL: define i1 @length32_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 32) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length32_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 32) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length32_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length32_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 32) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length32_lt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length32_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 32) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length32_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 32) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length32_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 32) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length32_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 32) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 32) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length32_gt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length32_gt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 32) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length32_gt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 32) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length32_gt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 32) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length32_gt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 32) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 32) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length32_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"="128" {
+; X86-NOSSE-LABEL: length32_eq_prefer128:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl $32
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    calll memcmp
+; X86-NOSSE-NEXT:    addl $12, %esp
+; X86-NOSSE-NEXT:    testl %eax, %eax
+; X86-NOSSE-NEXT:    sete %al
+; X86-NOSSE-NEXT:    retl
+;
+; X86-LABEL: define i1 @length32_eq_prefer128(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 32) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length32_eq_prefer128(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 32) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length32_eq_prefer128(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length32_eq_prefer128(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 32) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length32_eq_const(ptr %X) nounwind {
+; X86-NOSSE-LABEL: length32_eq_const:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl $32
+; X86-NOSSE-NEXT:    pushl $.L.str
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    calll memcmp
+; X86-NOSSE-NEXT:    addl $12, %esp
+; X86-NOSSE-NEXT:    testl %eax, %eax
+; X86-NOSSE-NEXT:    setne %al
+; X86-NOSSE-NEXT:    retl
+;
+; X86-LABEL: define i1 @length32_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 32) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length32_eq_const(
+; X86-SSE1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 32) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length32_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 65382562593882267225249597816672106294
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X86-SSE2-NEXT:    ret i1 [[TMP7]]
+;
+; X86-SSE41-LABEL: define i1 @length32_eq_const(
+; X86-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 65382562593882267225249597816672106294
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X86-SSE41-NEXT:    ret i1 [[TMP7]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 32) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length48(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length48(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 48) #[[ATTR5]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE1-LABEL: define i32 @length48(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 48) #[[ATTR5]]
+; X86-SSE1-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length48(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 48) #[[ATTR5]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+; X86-SSE41-LABEL: define i32 @length48(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 48) #[[ATTR5]]
+; X86-SSE41-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 48) nounwind
+  ret i32 %m
+}
+
+define i1 @length48_eq(ptr %x, ptr %y) nounwind {
+; X86-NOSSE-LABEL: length48_eq:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl $48
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    calll memcmp
+; X86-NOSSE-NEXT:    addl $12, %esp
+; X86-NOSSE-NEXT:    testl %eax, %eax
+; X86-NOSSE-NEXT:    sete %al
+; X86-NOSSE-NEXT:    retl
+;
+; X86-LABEL: define i1 @length48_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 48) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length48_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 48) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length48_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = load i128, ptr [[TMP9]], align 1
+; X86-SSE2-NEXT:    [[TMP12:%.*]] = load i128, ptr [[TMP10]], align 1
+; X86-SSE2-NEXT:    [[TMP13:%.*]] = xor i128 [[TMP11]], [[TMP12]]
+; X86-SSE2-NEXT:    [[TMP14:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X86-SSE2-NEXT:    [[TMP15:%.*]] = or i128 [[TMP14]], [[TMP13]]
+; X86-SSE2-NEXT:    [[TMP16:%.*]] = icmp ne i128 [[TMP15]], 0
+; X86-SSE2-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP17]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length48_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = load i128, ptr [[TMP9]], align 1
+; X86-SSE41-NEXT:    [[TMP12:%.*]] = load i128, ptr [[TMP10]], align 1
+; X86-SSE41-NEXT:    [[TMP13:%.*]] = xor i128 [[TMP11]], [[TMP12]]
+; X86-SSE41-NEXT:    [[TMP14:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X86-SSE41-NEXT:    [[TMP15:%.*]] = or i128 [[TMP14]], [[TMP13]]
+; X86-SSE41-NEXT:    [[TMP16:%.*]] = icmp ne i128 [[TMP15]], 0
+; X86-SSE41-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP17]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 48) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length48_lt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length48_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 48) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length48_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 48) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length48_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 48) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length48_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 48) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 48) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length48_gt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length48_gt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 48) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length48_gt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 48) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length48_gt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 48) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length48_gt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 48) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 48) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length48_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"="128" {
+; X86-NOSSE-LABEL: length48_eq_prefer128:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl $48
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    calll memcmp
+; X86-NOSSE-NEXT:    addl $12, %esp
+; X86-NOSSE-NEXT:    testl %eax, %eax
+; X86-NOSSE-NEXT:    sete %al
+; X86-NOSSE-NEXT:    retl
+;
+; X86-LABEL: define i1 @length48_eq_prefer128(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 48) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length48_eq_prefer128(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 48) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length48_eq_prefer128(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = load i128, ptr [[TMP9]], align 1
+; X86-SSE2-NEXT:    [[TMP12:%.*]] = load i128, ptr [[TMP10]], align 1
+; X86-SSE2-NEXT:    [[TMP13:%.*]] = xor i128 [[TMP11]], [[TMP12]]
+; X86-SSE2-NEXT:    [[TMP14:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X86-SSE2-NEXT:    [[TMP15:%.*]] = or i128 [[TMP14]], [[TMP13]]
+; X86-SSE2-NEXT:    [[TMP16:%.*]] = icmp ne i128 [[TMP15]], 0
+; X86-SSE2-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP17]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length48_eq_prefer128(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = load i128, ptr [[TMP9]], align 1
+; X86-SSE41-NEXT:    [[TMP12:%.*]] = load i128, ptr [[TMP10]], align 1
+; X86-SSE41-NEXT:    [[TMP13:%.*]] = xor i128 [[TMP11]], [[TMP12]]
+; X86-SSE41-NEXT:    [[TMP14:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X86-SSE41-NEXT:    [[TMP15:%.*]] = or i128 [[TMP14]], [[TMP13]]
+; X86-SSE41-NEXT:    [[TMP16:%.*]] = icmp ne i128 [[TMP15]], 0
+; X86-SSE41-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP17]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 48) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length48_eq_const(ptr %X) nounwind {
+; X86-NOSSE-LABEL: length48_eq_const:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl $48
+; X86-NOSSE-NEXT:    pushl $.L.str
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    calll memcmp
+; X86-NOSSE-NEXT:    addl $12, %esp
+; X86-NOSSE-NEXT:    testl %eax, %eax
+; X86-NOSSE-NEXT:    setne %al
+; X86-NOSSE-NEXT:    retl
+;
+; X86-LABEL: define i1 @length48_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 48) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length48_eq_const(
+; X86-SSE1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 48) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length48_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 65382562593882267225249597816672106294
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP6]], align 1
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP7]], 73389002901949112059321871464991568690
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = or i128 [[TMP9]], [[TMP8]]
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = icmp ne i128 [[TMP10]], 0
+; X86-SSE2-NEXT:    [[TMP12:%.*]] = zext i1 [[TMP11]] to i32
+; X86-SSE2-NEXT:    ret i1 [[TMP11]]
+;
+; X86-SSE41-LABEL: define i1 @length48_eq_const(
+; X86-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 65382562593882267225249597816672106294
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP6]], align 1
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP7]], 73389002901949112059321871464991568690
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = or i128 [[TMP9]], [[TMP8]]
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = icmp ne i128 [[TMP10]], 0
+; X86-SSE41-NEXT:    [[TMP12:%.*]] = zext i1 [[TMP11]] to i32
+; X86-SSE41-NEXT:    ret i1 [[TMP11]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 48) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length63(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length63(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 63) #[[ATTR5]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE1-LABEL: define i32 @length63(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 63) #[[ATTR5]]
+; X86-SSE1-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length63(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 63) #[[ATTR5]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+; X86-SSE41-LABEL: define i32 @length63(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 63) #[[ATTR5]]
+; X86-SSE41-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 63) nounwind
+  ret i32 %m
+}
+
+define i1 @length63_eq(ptr %x, ptr %y) nounwind {
+; X86-NOSSE-LABEL: length63_eq:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl $63
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    calll memcmp
+; X86-NOSSE-NEXT:    addl $12, %esp
+; X86-NOSSE-NEXT:    testl %eax, %eax
+; X86-NOSSE-NEXT:    setne %al
+; X86-NOSSE-NEXT:    retl
+;
+; X86-LABEL: define i1 @length63_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 63) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length63_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 63) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length63_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = load i128, ptr [[TMP9]], align 1
+; X86-SSE2-NEXT:    [[TMP12:%.*]] = load i128, ptr [[TMP10]], align 1
+; X86-SSE2-NEXT:    [[TMP13:%.*]] = xor i128 [[TMP11]], [[TMP12]]
+; X86-SSE2-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 47
+; X86-SSE2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 47
+; X86-SSE2-NEXT:    [[TMP16:%.*]] = load i128, ptr [[TMP14]], align 1
+; X86-SSE2-NEXT:    [[TMP17:%.*]] = load i128, ptr [[TMP15]], align 1
+; X86-SSE2-NEXT:    [[TMP18:%.*]] = xor i128 [[TMP16]], [[TMP17]]
+; X86-SSE2-NEXT:    [[TMP19:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X86-SSE2-NEXT:    [[TMP20:%.*]] = or i128 [[TMP13]], [[TMP18]]
+; X86-SSE2-NEXT:    [[TMP21:%.*]] = or i128 [[TMP19]], [[TMP20]]
+; X86-SSE2-NEXT:    [[TMP22:%.*]] = icmp ne i128 [[TMP21]], 0
+; X86-SSE2-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X86-SSE2-NEXT:    ret i1 [[TMP22]]
+;
+; X86-SSE41-LABEL: define i1 @length63_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = load i128, ptr [[TMP9]], align 1
+; X86-SSE41-NEXT:    [[TMP12:%.*]] = load i128, ptr [[TMP10]], align 1
+; X86-SSE41-NEXT:    [[TMP13:%.*]] = xor i128 [[TMP11]], [[TMP12]]
+; X86-SSE41-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 47
+; X86-SSE41-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 47
+; X86-SSE41-NEXT:    [[TMP16:%.*]] = load i128, ptr [[TMP14]], align 1
+; X86-SSE41-NEXT:    [[TMP17:%.*]] = load i128, ptr [[TMP15]], align 1
+; X86-SSE41-NEXT:    [[TMP18:%.*]] = xor i128 [[TMP16]], [[TMP17]]
+; X86-SSE41-NEXT:    [[TMP19:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X86-SSE41-NEXT:    [[TMP20:%.*]] = or i128 [[TMP13]], [[TMP18]]
+; X86-SSE41-NEXT:    [[TMP21:%.*]] = or i128 [[TMP19]], [[TMP20]]
+; X86-SSE41-NEXT:    [[TMP22:%.*]] = icmp ne i128 [[TMP21]], 0
+; X86-SSE41-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X86-SSE41-NEXT:    ret i1 [[TMP22]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 63) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length63_lt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length63_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 63) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length63_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 63) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length63_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 63) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length63_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 63) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 63) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length63_gt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length63_gt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 63) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length63_gt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 63) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length63_gt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 63) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length63_gt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 63) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 63) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length63_eq_const(ptr %X) nounwind {
+; X86-NOSSE-LABEL: length63_eq_const:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl $63
+; X86-NOSSE-NEXT:    pushl $.L.str
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    calll memcmp
+; X86-NOSSE-NEXT:    addl $12, %esp
+; X86-NOSSE-NEXT:    testl %eax, %eax
+; X86-NOSSE-NEXT:    sete %al
+; X86-NOSSE-NEXT:    retl
+;
+; X86-LABEL: define i1 @length63_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 63) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length63_eq_const(
+; X86-SSE1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 63) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length63_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 65382562593882267225249597816672106294
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP6]], align 1
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP7]], 73389002901949112059321871464991568690
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 47
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = load i128, ptr [[TMP9]], align 1
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = xor i128 [[TMP10]], 66716800424378146251538984255488604215
+; X86-SSE2-NEXT:    [[TMP12:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X86-SSE2-NEXT:    [[TMP13:%.*]] = or i128 [[TMP8]], [[TMP11]]
+; X86-SSE2-NEXT:    [[TMP14:%.*]] = or i128 [[TMP12]], [[TMP13]]
+; X86-SSE2-NEXT:    [[TMP15:%.*]] = icmp ne i128 [[TMP14]], 0
+; X86-SSE2-NEXT:    [[TMP16:%.*]] = zext i1 [[TMP15]] to i32
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP16]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length63_eq_const(
+; X86-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 65382562593882267225249597816672106294
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP6]], align 1
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP7]], 73389002901949112059321871464991568690
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 47
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = load i128, ptr [[TMP9]], align 1
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = xor i128 [[TMP10]], 66716800424378146251538984255488604215
+; X86-SSE41-NEXT:    [[TMP12:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X86-SSE41-NEXT:    [[TMP13:%.*]] = or i128 [[TMP8]], [[TMP11]]
+; X86-SSE41-NEXT:    [[TMP14:%.*]] = or i128 [[TMP12]], [[TMP13]]
+; X86-SSE41-NEXT:    [[TMP15:%.*]] = icmp ne i128 [[TMP14]], 0
+; X86-SSE41-NEXT:    [[TMP16:%.*]] = zext i1 [[TMP15]] to i32
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP16]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 63) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length64(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length64(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 64) #[[ATTR5]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE1-LABEL: define i32 @length64(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 64) #[[ATTR5]]
+; X86-SSE1-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length64(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 64) #[[ATTR5]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+; X86-SSE41-LABEL: define i32 @length64(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 64) #[[ATTR5]]
+; X86-SSE41-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 64) nounwind
+  ret i32 %m
+}
+
+define i1 @length64_eq(ptr %x, ptr %y) nounwind {
+; X86-NOSSE-LABEL: length64_eq:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl $64
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    calll memcmp
+; X86-NOSSE-NEXT:    addl $12, %esp
+; X86-NOSSE-NEXT:    testl %eax, %eax
+; X86-NOSSE-NEXT:    setne %al
+; X86-NOSSE-NEXT:    retl
+;
+; X86-LABEL: define i1 @length64_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 64) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length64_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 64) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length64_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = load i128, ptr [[TMP9]], align 1
+; X86-SSE2-NEXT:    [[TMP12:%.*]] = load i128, ptr [[TMP10]], align 1
+; X86-SSE2-NEXT:    [[TMP13:%.*]] = xor i128 [[TMP11]], [[TMP12]]
+; X86-SSE2-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 48
+; X86-SSE2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 48
+; X86-SSE2-NEXT:    [[TMP16:%.*]] = load i128, ptr [[TMP14]], align 1
+; X86-SSE2-NEXT:    [[TMP17:%.*]] = load i128, ptr [[TMP15]], align 1
+; X86-SSE2-NEXT:    [[TMP18:%.*]] = xor i128 [[TMP16]], [[TMP17]]
+; X86-SSE2-NEXT:    [[TMP19:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X86-SSE2-NEXT:    [[TMP20:%.*]] = or i128 [[TMP13]], [[TMP18]]
+; X86-SSE2-NEXT:    [[TMP21:%.*]] = or i128 [[TMP19]], [[TMP20]]
+; X86-SSE2-NEXT:    [[TMP22:%.*]] = icmp ne i128 [[TMP21]], 0
+; X86-SSE2-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X86-SSE2-NEXT:    ret i1 [[TMP22]]
+;
+; X86-SSE41-LABEL: define i1 @length64_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = load i128, ptr [[TMP9]], align 1
+; X86-SSE41-NEXT:    [[TMP12:%.*]] = load i128, ptr [[TMP10]], align 1
+; X86-SSE41-NEXT:    [[TMP13:%.*]] = xor i128 [[TMP11]], [[TMP12]]
+; X86-SSE41-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 48
+; X86-SSE41-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 48
+; X86-SSE41-NEXT:    [[TMP16:%.*]] = load i128, ptr [[TMP14]], align 1
+; X86-SSE41-NEXT:    [[TMP17:%.*]] = load i128, ptr [[TMP15]], align 1
+; X86-SSE41-NEXT:    [[TMP18:%.*]] = xor i128 [[TMP16]], [[TMP17]]
+; X86-SSE41-NEXT:    [[TMP19:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X86-SSE41-NEXT:    [[TMP20:%.*]] = or i128 [[TMP13]], [[TMP18]]
+; X86-SSE41-NEXT:    [[TMP21:%.*]] = or i128 [[TMP19]], [[TMP20]]
+; X86-SSE41-NEXT:    [[TMP22:%.*]] = icmp ne i128 [[TMP21]], 0
+; X86-SSE41-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X86-SSE41-NEXT:    ret i1 [[TMP22]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 64) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length64_lt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length64_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 64) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length64_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 64) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length64_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 64) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length64_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 64) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 64) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length64_gt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length64_gt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 64) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length64_gt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 64) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length64_gt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 64) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length64_gt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 64) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 64) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length64_eq_const(ptr %X) nounwind {
+; X86-NOSSE-LABEL: length64_eq_const:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl $64
+; X86-NOSSE-NEXT:    pushl $.L.str
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    calll memcmp
+; X86-NOSSE-NEXT:    addl $12, %esp
+; X86-NOSSE-NEXT:    testl %eax, %eax
+; X86-NOSSE-NEXT:    sete %al
+; X86-NOSSE-NEXT:    retl
+;
+; X86-LABEL: define i1 @length64_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 64) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length64_eq_const(
+; X86-SSE1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 64) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length64_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 65382562593882267225249597816672106294
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP6]], align 1
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP7]], 73389002901949112059321871464991568690
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 48
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = load i128, ptr [[TMP9]], align 1
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = xor i128 [[TMP10]], 68051240286688436651889234231545575736
+; X86-SSE2-NEXT:    [[TMP12:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X86-SSE2-NEXT:    [[TMP13:%.*]] = or i128 [[TMP8]], [[TMP11]]
+; X86-SSE2-NEXT:    [[TMP14:%.*]] = or i128 [[TMP12]], [[TMP13]]
+; X86-SSE2-NEXT:    [[TMP15:%.*]] = icmp ne i128 [[TMP14]], 0
+; X86-SSE2-NEXT:    [[TMP16:%.*]] = zext i1 [[TMP15]] to i32
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP16]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length64_eq_const(
+; X86-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 65382562593882267225249597816672106294
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP6]], align 1
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP7]], 73389002901949112059321871464991568690
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 48
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = load i128, ptr [[TMP9]], align 1
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = xor i128 [[TMP10]], 68051240286688436651889234231545575736
+; X86-SSE41-NEXT:    [[TMP12:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X86-SSE41-NEXT:    [[TMP13:%.*]] = or i128 [[TMP8]], [[TMP11]]
+; X86-SSE41-NEXT:    [[TMP14:%.*]] = or i128 [[TMP12]], [[TMP13]]
+; X86-SSE41-NEXT:    [[TMP15:%.*]] = icmp ne i128 [[TMP14]], 0
+; X86-SSE41-NEXT:    [[TMP16:%.*]] = zext i1 [[TMP15]] to i32
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP16]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 64) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length96(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length96(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 96) #[[ATTR5]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE1-LABEL: define i32 @length96(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 96) #[[ATTR5]]
+; X86-SSE1-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length96(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 96) #[[ATTR5]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+; X86-SSE41-LABEL: define i32 @length96(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 96) #[[ATTR5]]
+; X86-SSE41-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 96) nounwind
+  ret i32 %m
+}
+
+define i1 @length96_eq(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length96_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 96) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length96_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 96) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length96_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 96) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length96_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 96) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 96) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length96_lt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length96_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 96) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length96_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 96) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length96_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 96) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length96_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 96) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 96) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length96_gt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length96_gt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 96) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length96_gt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 96) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length96_gt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 96) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length96_gt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 96) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 96) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length96_eq_const(ptr %X) nounwind {
+; X86-LABEL: define i1 @length96_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 96) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length96_eq_const(
+; X86-SSE1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 96) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length96_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 96) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length96_eq_const(
+; X86-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 96) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 96) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length127(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length127(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 127) #[[ATTR5]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE1-LABEL: define i32 @length127(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 127) #[[ATTR5]]
+; X86-SSE1-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length127(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 127) #[[ATTR5]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+; X86-SSE41-LABEL: define i32 @length127(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 127) #[[ATTR5]]
+; X86-SSE41-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 127) nounwind
+  ret i32 %m
+}
+
+define i1 @length127_eq(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length127_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 127) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length127_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 127) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length127_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 127) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length127_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 127) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 127) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length127_lt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length127_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 127) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length127_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 127) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length127_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 127) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length127_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 127) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 127) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length127_gt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length127_gt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 127) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length127_gt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 127) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length127_gt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 127) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length127_gt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 127) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 127) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length127_eq_const(ptr %X) nounwind {
+; X86-LABEL: define i1 @length127_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 127) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length127_eq_const(
+; X86-SSE1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 127) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length127_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 127) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length127_eq_const(
+; X86-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 127) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 127) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length128(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length128(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 128) #[[ATTR5]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE1-LABEL: define i32 @length128(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 128) #[[ATTR5]]
+; X86-SSE1-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length128(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 128) #[[ATTR5]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+; X86-SSE41-LABEL: define i32 @length128(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 128) #[[ATTR5]]
+; X86-SSE41-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 128) nounwind
+  ret i32 %m
+}
+
+define i1 @length128_eq(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length128_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 128) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length128_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 128) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length128_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 128) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length128_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 128) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 128) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length128_lt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length128_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 128) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length128_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 128) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length128_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 128) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length128_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 128) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 128) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length128_gt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length128_gt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 128) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length128_gt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 128) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length128_gt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 128) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length128_gt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 128) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 128) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length128_eq_const(ptr %X) nounwind {
+; X86-LABEL: define i1 @length128_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 128) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length128_eq_const(
+; X86-SSE1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 128) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length128_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 128) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length128_eq_const(
+; X86-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 128) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 128) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length192(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length192(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 192) #[[ATTR5]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE1-LABEL: define i32 @length192(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 192) #[[ATTR5]]
+; X86-SSE1-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length192(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 192) #[[ATTR5]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+; X86-SSE41-LABEL: define i32 @length192(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 192) #[[ATTR5]]
+; X86-SSE41-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 192) nounwind
+  ret i32 %m
+}
+
+define i1 @length192_eq(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length192_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 192) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length192_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 192) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length192_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 192) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length192_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 192) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 192) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length192_lt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length192_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 192) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length192_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 192) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length192_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 192) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length192_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 192) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 192) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length192_gt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length192_gt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 192) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length192_gt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 192) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length192_gt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 192) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length192_gt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 192) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 192) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length192_eq_const(ptr %X) nounwind {
+; X86-LABEL: define i1 @length192_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 192) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length192_eq_const(
+; X86-SSE1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 192) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length192_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 192) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length192_eq_const(
+; X86-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 192) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 192) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length255(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length255(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 255) #[[ATTR5]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE1-LABEL: define i32 @length255(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 255) #[[ATTR5]]
+; X86-SSE1-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length255(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 255) #[[ATTR5]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+; X86-SSE41-LABEL: define i32 @length255(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 255) #[[ATTR5]]
+; X86-SSE41-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 255) nounwind
+  ret i32 %m
+}
+
+define i1 @length255_eq(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length255_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 255) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length255_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 255) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length255_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 255) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length255_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 255) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 255) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length255_lt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length255_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 255) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length255_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 255) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length255_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 255) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length255_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 255) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 255) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length255_gt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length255_gt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 255) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length255_gt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 255) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length255_gt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 255) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length255_gt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 255) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 255) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length255_eq_const(ptr %X) nounwind {
+; X86-LABEL: define i1 @length255_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 255) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length255_eq_const(
+; X86-SSE1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 255) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length255_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 255) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length255_eq_const(
+; X86-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 255) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 255) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length256(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length256(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 256) #[[ATTR5]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE1-LABEL: define i32 @length256(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 256) #[[ATTR5]]
+; X86-SSE1-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length256(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 256) #[[ATTR5]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+; X86-SSE41-LABEL: define i32 @length256(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 256) #[[ATTR5]]
+; X86-SSE41-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 256) nounwind
+  ret i32 %m
+}
+
+define i1 @length256_eq(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length256_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 256) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length256_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 256) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length256_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 256) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length256_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 256) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 256) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length256_lt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length256_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 256) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length256_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 256) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length256_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 256) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length256_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 256) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 256) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length256_gt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length256_gt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 256) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length256_gt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 256) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length256_gt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 256) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length256_gt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 256) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 256) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length256_eq_const(ptr %X) nounwind {
+; X86-LABEL: define i1 @length256_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 256) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length256_eq_const(
+; X86-SSE1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 256) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length256_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 256) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length256_eq_const(
+; X86-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 256) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 256) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length384(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length384(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 384) #[[ATTR5]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE1-LABEL: define i32 @length384(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 384) #[[ATTR5]]
+; X86-SSE1-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length384(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 384) #[[ATTR5]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+; X86-SSE41-LABEL: define i32 @length384(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 384) #[[ATTR5]]
+; X86-SSE41-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 384) nounwind
+  ret i32 %m
+}
+
+define i1 @length384_eq(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length384_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 384) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length384_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 384) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length384_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 384) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length384_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 384) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 384) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+; 384-byte memcmp whose result is tested with `icmp slt 0`. Every check prefix
+; expects the call to be left unexpanded.
+define i1 @length384_lt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length384_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 384) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length384_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 384) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length384_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 384) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length384_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 384) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 384) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+; 384-byte memcmp whose result is tested with `icmp sgt 0`. Every check prefix
+; expects the call to be left unexpanded.
+define i1 @length384_gt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length384_gt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 384) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length384_gt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 384) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length384_gt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 384) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length384_gt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 384) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 384) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+; 384-byte memcmp against the constant @.str, result tested with `icmp eq 0`.
+; Every check prefix expects the call to be left unexpanded.
+define i1 @length384_eq_const(ptr %X) nounwind {
+; X86-LABEL: define i1 @length384_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 384) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length384_eq_const(
+; X86-SSE1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 384) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length384_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 384) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length384_eq_const(
+; X86-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 384) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 384) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; 511-byte memcmp returning the raw three-way result. Every check prefix
+; expects the call to be left unexpanded.
+define i32 @length511(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length511(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 511) #[[ATTR5]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE1-LABEL: define i32 @length511(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 511) #[[ATTR5]]
+; X86-SSE1-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length511(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 511) #[[ATTR5]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+; X86-SSE41-LABEL: define i32 @length511(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 511) #[[ATTR5]]
+; X86-SSE41-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 511) nounwind
+  ret i32 %m
+}
+
+; 511-byte memcmp whose result is tested with `icmp ne 0` (despite the `_eq`
+; name). Every check prefix expects the call to be left unexpanded.
+define i1 @length511_eq(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length511_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 511) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length511_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 511) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length511_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 511) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length511_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 511) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 511) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+; 511-byte memcmp whose result is tested with `icmp slt 0`. Every check prefix
+; expects the call to be left unexpanded.
+define i1 @length511_lt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length511_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 511) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length511_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 511) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length511_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 511) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length511_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 511) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 511) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+; 511-byte memcmp whose result is tested with `icmp sgt 0`. Every check prefix
+; expects the call to be left unexpanded.
+define i1 @length511_gt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length511_gt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 511) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length511_gt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 511) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length511_gt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 511) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length511_gt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 511) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 511) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+; 511-byte memcmp against the constant @.str, result tested with `icmp eq 0`.
+; Every check prefix expects the call to be left unexpanded.
+define i1 @length511_eq_const(ptr %X) nounwind {
+; X86-LABEL: define i1 @length511_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 511) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length511_eq_const(
+; X86-SSE1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 511) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length511_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 511) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length511_eq_const(
+; X86-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 511) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 511) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; 512-byte memcmp returning the raw three-way result. Every check prefix
+; expects the call to be left unexpanded.
+define i32 @length512(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length512(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 512) #[[ATTR5]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE1-LABEL: define i32 @length512(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 512) #[[ATTR5]]
+; X86-SSE1-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length512(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 512) #[[ATTR5]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+; X86-SSE41-LABEL: define i32 @length512(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 512) #[[ATTR5]]
+; X86-SSE41-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 512) nounwind
+  ret i32 %m
+}
+
+; 512-byte memcmp whose result is tested with `icmp ne 0` (despite the `_eq`
+; name). Every check prefix expects the call to be left unexpanded.
+define i1 @length512_eq(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length512_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 512) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length512_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 512) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length512_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 512) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length512_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 512) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 512) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+; 512-byte memcmp whose result is tested with `icmp slt 0`. Every check prefix
+; expects the call to be left unexpanded.
+define i1 @length512_lt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length512_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 512) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length512_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 512) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length512_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 512) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length512_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 512) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 512) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+; 512-byte memcmp whose result is tested with `icmp sgt 0`. Every check prefix
+; expects the call to be left unexpanded.
+define i1 @length512_gt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length512_gt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 512) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length512_gt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 512) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length512_gt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 512) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length512_gt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 512) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 512) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+; 512-byte memcmp against the constant @.str, result tested with `icmp eq 0`.
+; Every check prefix expects the call to be left unexpanded.
+define i1 @length512_eq_const(ptr %X) nounwind {
+; X86-LABEL: define i1 @length512_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 512) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length512_eq_const(
+; X86-SSE1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 512) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length512_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 512) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length512_eq_const(
+; X86-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 512) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 512) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; This checks that a memcmp with a huge constant size is left as a libcall.
+; The size is i32 -1 (all 32 bits set), the value every check prefix expects
+; to see on the call.
+define i32 @huge_length(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @huge_length(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 -1) #[[ATTR5]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE1-LABEL: define i32 @huge_length(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 -1) #[[ATTR5]]
+; X86-SSE1-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @huge_length(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 -1) #[[ATTR5]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+; X86-SSE41-LABEL: define i32 @huge_length(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 -1) #[[ATTR5]]
+; X86-SSE41-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 -1) nounwind
+  ret i32 %m
+}
+
+; Equality form of the huge-size check: the size is i32 -1 (all 32 bits set)
+; and every check prefix expects the memcmp call to be left unexpanded.
+define i1 @huge_length_eq(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @huge_length_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 -1) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @huge_length_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 -1) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @huge_length_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 -1) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @huge_length_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 -1) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 -1) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; This checks non-constant sizes: with a runtime %size, every check prefix
+; expects the memcmp call to be left unexpanded.
+define i32 @nonconst_length(ptr %X, ptr %Y, i32 %size) nounwind {
+; X86-LABEL: define i32 @nonconst_length(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i32 [[SIZE:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 [[SIZE]]) #[[ATTR5]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE1-LABEL: define i32 @nonconst_length(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i32 [[SIZE:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 [[SIZE]]) #[[ATTR5]]
+; X86-SSE1-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @nonconst_length(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i32 [[SIZE:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 [[SIZE]]) #[[ATTR5]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+; X86-SSE41-LABEL: define i32 @nonconst_length(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i32 [[SIZE:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 [[SIZE]]) #[[ATTR5]]
+; X86-SSE41-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 %size) nounwind
+  ret i32 %m
+}
+
+; Equality form of the non-constant-size check: the size is the runtime %size
+; argument and every check prefix expects the memcmp call to be left unexpanded.
+define i1 @nonconst_length_eq(ptr %X, ptr %Y, i32 %size) nounwind {
+; X86-LABEL: define i1 @nonconst_length_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i32 [[SIZE:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 [[SIZE]]) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @nonconst_length_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i32 [[SIZE:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 [[SIZE]]) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @nonconst_length_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i32 [[SIZE:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 [[SIZE]]) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @nonconst_length_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i32 [[SIZE:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 [[SIZE]]) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 %size) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
diff --git a/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-more-load-pairs.ll b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-more-load-pairs.ll
new file mode 100644
index 00000000000000..56489a08800b76
--- /dev/null
+++ b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-more-load-pairs.ll
@@ -0,0 +1,18833 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; NOTE: This is a copy of llvm/test/CodeGen/X86/memcmp.ll with more load pairs. Please keep it that way.
+; RUN: opt -S -passes=expand-memcmp -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4  -mtriple=x86_64-unknown-unknown               < %s | FileCheck %s --check-prefixes=X64
+; RUN: opt -S -passes=expand-memcmp -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 -mtriple=x86_64-unknown-unknown -mattr=sse4.1 < %s | FileCheck %s --check-prefixes=X64-SSE41
+; RUN: opt -S -passes=expand-memcmp -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 -mtriple=x86_64-unknown-unknown -mattr=avx    < %s | FileCheck %s --check-prefixes=X64-AVX1
+; RUN: opt -S -passes=expand-memcmp -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 -mtriple=x86_64-unknown-unknown -mattr=avx2   < %s | FileCheck %s --check-prefixes=X64-AVX2
+; RUN: opt -S -passes=expand-memcmp -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 -mtriple=x86_64-unknown-unknown -mattr=avx512bw,+prefer-256-bit < %s | FileCheck %s --check-prefixes=X64-AVX512BW-256
+; RUN: opt -S -passes=expand-memcmp -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 -mtriple=x86_64-unknown-unknown -mattr=avx512bw,-prefer-256-bit < %s | FileCheck %s --check-prefixes=X64-AVX512BW
+; RUN: opt -S -passes=expand-memcmp -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 -mtriple=x86_64-unknown-unknown -mattr=avx512f,+prefer-256-bit,-prefer-mask-registers < %s | FileCheck %s --check-prefixes=X64-AVX512F-256
+; RUN: opt -S -passes=expand-memcmp -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 -mtriple=x86_64-unknown-unknown -mattr=avx512f,-prefer-256-bit,-prefer-mask-registers < %s | FileCheck %s --check-prefixes=X64-AVX512F
+; RUN: opt -S -passes=expand-memcmp -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 -mtriple=x86_64-unknown-unknown -mattr=avx512f,+prefer-256-bit,+prefer-mask-registers < %s | FileCheck %s --check-prefixes=X64-MIC-AVX2
+; RUN: opt -S -passes=expand-memcmp -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 -mtriple=x86_64-unknown-unknown -mattr=avx512f,-prefer-256-bit,+prefer-mask-registers < %s | FileCheck %s --check-prefixes=X64-MIC-AVX512F
+
+; This tests the expand-memcmp pass's IR-level inlining/optimization of memcmp
+; rdar://6480398
+
+ at .str = private constant [513 x i8] c"01234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901\00", align 1
+
+declare dso_local i32 @memcmp(ptr, ptr, i64)
+
+; Zero-length memcmp returning the raw result: every check prefix expects the
+; call to be replaced by `ret i32 0`.
+define i32 @length0(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length0(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0:[0-9]+]] {
+; X64-NEXT:    ret i32 0
+;
+; X64-SSE41-LABEL: define i32 @length0(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1:[0-9]+]] {
+; X64-SSE41-NEXT:    ret i32 0
+;
+; X64-AVX1-LABEL: define i32 @length0(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1:[0-9]+]] {
+; X64-AVX1-NEXT:    ret i32 0
+;
+; X64-AVX2-LABEL: define i32 @length0(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1:[0-9]+]] {
+; X64-AVX2-NEXT:    ret i32 0
+;
+; X64-AVX512BW-256-LABEL: define i32 @length0(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1:[0-9]+]] {
+; X64-AVX512BW-256-NEXT:    ret i32 0
+;
+; X64-AVX512BW-LABEL: define i32 @length0(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1:[0-9]+]] {
+; X64-AVX512BW-NEXT:    ret i32 0
+;
+; X64-AVX512F-256-LABEL: define i32 @length0(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1:[0-9]+]] {
+; X64-AVX512F-256-NEXT:    ret i32 0
+;
+; X64-AVX512F-LABEL: define i32 @length0(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1:[0-9]+]] {
+; X64-AVX512F-NEXT:    ret i32 0
+;
+; X64-MIC-AVX2-LABEL: define i32 @length0(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1:[0-9]+]] {
+; X64-MIC-AVX2-NEXT:    ret i32 0
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length0(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1:[0-9]+]] {
+; X64-MIC-AVX512F-NEXT:    ret i32 0
+;
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 0) nounwind
+  ret i32 %m
+  }
+
+; Zero-length memcmp tested with `icmp eq 0`: every check prefix expects the
+; whole body to become `ret i1 true`.
+define i1 @length0_eq(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length0_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    ret i1 true
+;
+; X64-SSE41-LABEL: define i1 @length0_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    ret i1 true
+;
+; X64-AVX1-LABEL: define i1 @length0_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    ret i1 true
+;
+; X64-AVX2-LABEL: define i1 @length0_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    ret i1 true
+;
+; X64-AVX512BW-256-LABEL: define i1 @length0_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    ret i1 true
+;
+; X64-AVX512BW-LABEL: define i1 @length0_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    ret i1 true
+;
+; X64-AVX512F-256-LABEL: define i1 @length0_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    ret i1 true
+;
+; X64-AVX512F-LABEL: define i1 @length0_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    ret i1 true
+;
+; X64-MIC-AVX2-LABEL: define i1 @length0_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    ret i1 true
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length0_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    ret i1 true
+;
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 0) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; Zero-length memcmp tested with `icmp slt 0`: every check prefix expects the
+; whole body to become `ret i1 false`.
+define i1 @length0_lt(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length0_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    ret i1 false
+;
+; X64-SSE41-LABEL: define i1 @length0_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    ret i1 false
+;
+; X64-AVX1-LABEL: define i1 @length0_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    ret i1 false
+;
+; X64-AVX2-LABEL: define i1 @length0_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    ret i1 false
+;
+; X64-AVX512BW-256-LABEL: define i1 @length0_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    ret i1 false
+;
+; X64-AVX512BW-LABEL: define i1 @length0_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    ret i1 false
+;
+; X64-AVX512F-256-LABEL: define i1 @length0_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    ret i1 false
+;
+; X64-AVX512F-LABEL: define i1 @length0_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    ret i1 false
+;
+; X64-MIC-AVX2-LABEL: define i1 @length0_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    ret i1 false
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length0_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    ret i1 false
+;
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 0) nounwind
+  %c = icmp slt i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length2(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length2(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-NEXT:    ret i32 [[TMP7]]
+;
+; X64-SSE41-LABEL: define i32 @length2(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    ret i32 [[TMP7]]
+;
+; X64-AVX1-LABEL: define i32 @length2(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    ret i32 [[TMP7]]
+;
+; X64-AVX2-LABEL: define i32 @length2(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    ret i32 [[TMP7]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length2(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    ret i32 [[TMP7]]
+;
+; X64-AVX512BW-LABEL: define i32 @length2(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    ret i32 [[TMP7]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length2(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    ret i32 [[TMP7]]
+;
+; X64-AVX512F-LABEL: define i32 @length2(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    ret i32 [[TMP7]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length2(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    ret i32 [[TMP7]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length2(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[TMP7]]
+;
+
+
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
+  ret i32 %m
+}
+
+define i1 @length2_eq(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length2_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length2_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length2_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length2_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length2_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length2_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length2_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length2_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length2_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length2_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length2_lt(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length2_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-NEXT:    [[C:%.*]] = icmp slt i32 [[TMP7]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length2_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp slt i32 [[TMP7]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length2_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp slt i32 [[TMP7]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length2_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp slt i32 [[TMP7]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length2_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp slt i32 [[TMP7]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length2_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp slt i32 [[TMP7]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length2_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp slt i32 [[TMP7]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length2_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp slt i32 [[TMP7]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length2_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp slt i32 [[TMP7]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length2_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp slt i32 [[TMP7]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
+  %c = icmp slt i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length2_gt(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length2_gt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP7]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length2_gt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP7]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length2_gt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP7]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length2_gt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP7]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length2_gt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP7]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length2_gt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP7]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length2_gt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP7]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length2_gt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP7]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length2_gt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP7]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length2_gt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP7]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
+  %c = icmp sgt i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length2_eq_const(ptr %X) nounwind {
+; X64-LABEL: define i1 @length2_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = icmp ne i16 [[TMP1]], 12849
+; X64-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-NEXT:    ret i1 [[TMP2]]
+;
+; X64-SSE41-LABEL: define i1 @length2_eq_const(
+; X64-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = icmp ne i16 [[TMP1]], 12849
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-SSE41-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX1-LABEL: define i1 @length2_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = icmp ne i16 [[TMP1]], 12849
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX2-LABEL: define i1 @length2_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = icmp ne i16 [[TMP1]], 12849
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length2_eq_const(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = icmp ne i16 [[TMP1]], 12849
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX512BW-256-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX512BW-LABEL: define i1 @length2_eq_const(
+; X64-AVX512BW-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = icmp ne i16 [[TMP1]], 12849
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX512BW-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length2_eq_const(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = icmp ne i16 [[TMP1]], 12849
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX512F-256-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX512F-LABEL: define i1 @length2_eq_const(
+; X64-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = icmp ne i16 [[TMP1]], 12849
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX512F-NEXT:    ret i1 [[TMP2]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length2_eq_const(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = icmp ne i16 [[TMP1]], 12849
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-MIC-AVX2-NEXT:    ret i1 [[TMP2]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length2_eq_const(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = icmp ne i16 [[TMP1]], 12849
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP2]]
+;
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i64 2) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length2_eq_nobuiltin_attr(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 2) #[[ATTR3:[0-9]+]]
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 2) #[[ATTR4:[0-9]+]]
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 2) #[[ATTR4:[0-9]+]]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 2) #[[ATTR4:[0-9]+]]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 2) #[[ATTR4:[0-9]+]]
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 2) #[[ATTR4:[0-9]+]]
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 2) #[[ATTR4:[0-9]+]]
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 2) #[[ATTR4:[0-9]+]]
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 2) #[[ATTR4:[0-9]+]]
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 2) #[[ATTR4:[0-9]+]]
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind nobuiltin
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length3(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length3(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i16 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br label [[ENDBLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-SSE41-LABEL: define i32 @length3(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X64-SSE41:       res_block:
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-SSE41:       loadbb:
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP5]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X64-SSE41-NEXT:    [[TMP6]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i16 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-SSE41:       loadbb1:
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-SSE41-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-SSE41-NEXT:    br label [[ENDBLOCK]]
+; X64-SSE41:       endblock:
+; X64-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-SSE41-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX1-LABEL: define i32 @length3(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i16 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX2-LABEL: define i32 @length3(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i16 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length3(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW-256:       res_block:
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb:
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X64-AVX512BW-256-NEXT:    [[TMP6]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp eq i16 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb1:
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX512BW-256-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX512BW-256:       endblock:
+; X64-AVX512BW-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-256-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512BW-LABEL: define i32 @length3(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW:       res_block:
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW:       loadbb:
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X64-AVX512BW-NEXT:    [[TMP6]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = icmp eq i16 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW:       loadbb1:
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX512BW-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX512BW:       endblock:
+; X64-AVX512BW-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length3(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F-256:       res_block:
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F-256:       loadbb:
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X64-AVX512F-256-NEXT:    [[TMP6]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp eq i16 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F-256:       loadbb1:
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX512F-256-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX512F-256:       endblock:
+; X64-AVX512F-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-256-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512F-LABEL: define i32 @length3(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F:       res_block:
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F:       loadbb:
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP5]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X64-AVX512F-NEXT:    [[TMP6]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i16 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F:       loadbb1:
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX512F-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX512F:       endblock:
+; X64-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length3(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX2:       res_block:
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb:
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X64-MIC-AVX2-NEXT:    [[TMP6]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i16 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb1:
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-MIC-AVX2-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK]]
+; X64-MIC-AVX2:       endblock:
+; X64-MIC-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length3(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX512F:       res_block:
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb:
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X64-MIC-AVX512F-NEXT:    [[TMP6]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i16 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb1:
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-MIC-AVX512F-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK]]
+; X64-MIC-AVX512F:       endblock:
+; X64-MIC-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[PHI_RES]]
+;
+
+
+
+; X64-SSE2:       res_block:
+
+
+
+; X64-SSE2:       loadbb:
+
+
+
+
+
+
+; X64-SSE2:       loadbb1:
+
+
+
+
+
+
+
+
+; X64-SSE2:       endblock:
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 3) nounwind
+  ret i32 %m
+}
+
+define i1 @length3_eq(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length3_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; X64-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; X64-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; X64-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; X64-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; X64-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-NEXT:    ret i1 [[TMP12]]
+;
+; X64-SSE41-LABEL: define i1 @length3_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; X64-SSE41-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; X64-SSE41-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-SSE41-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX1-LABEL: define i1 @length3_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX2-LABEL: define i1 @length3_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length3_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; X64-AVX512BW-256-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512BW-256-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX512BW-LABEL: define i1 @length3_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; X64-AVX512BW-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; X64-AVX512BW-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512BW-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length3_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; X64-AVX512F-256-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; X64-AVX512F-256-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512F-256-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX512F-LABEL: define i1 @length3_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; X64-AVX512F-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; X64-AVX512F-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512F-NEXT:    ret i1 [[TMP12]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length3_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; X64-MIC-AVX2-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-MIC-AVX2-NEXT:    ret i1 [[TMP12]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length3_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; X64-MIC-AVX512F-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP12]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 3) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length4(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length4(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-NEXT:    ret i32 [[TMP9]]
+;
+; X64-SSE41-LABEL: define i32 @length4(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-SSE41-NEXT:    ret i32 [[TMP9]]
+;
+; X64-AVX1-LABEL: define i32 @length4(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-AVX1-NEXT:    ret i32 [[TMP9]]
+;
+; X64-AVX2-LABEL: define i32 @length4(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-AVX2-NEXT:    ret i32 [[TMP9]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length4(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-AVX512BW-256-NEXT:    ret i32 [[TMP9]]
+;
+; X64-AVX512BW-LABEL: define i32 @length4(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-AVX512BW-NEXT:    ret i32 [[TMP9]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length4(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-AVX512F-256-NEXT:    ret i32 [[TMP9]]
+;
+; X64-AVX512F-LABEL: define i32 @length4(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-AVX512F-NEXT:    ret i32 [[TMP9]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length4(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-MIC-AVX2-NEXT:    ret i32 [[TMP9]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length4(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[TMP9]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
+  ret i32 %m
+}
+
+define i1 @length4_eq(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length4_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-NEXT:    ret i1 [[TMP3]]
+;
+; X64-SSE41-LABEL: define i1 @length4_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-SSE41-NEXT:    ret i1 [[TMP3]]
+;
+; X64-AVX1-LABEL: define i1 @length4_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP3]]
+;
+; X64-AVX2-LABEL: define i1 @length4_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP3]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length4_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512BW-256-NEXT:    ret i1 [[TMP3]]
+;
+; X64-AVX512BW-LABEL: define i1 @length4_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512BW-NEXT:    ret i1 [[TMP3]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length4_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512F-256-NEXT:    ret i1 [[TMP3]]
+;
+; X64-AVX512F-LABEL: define i1 @length4_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512F-NEXT:    ret i1 [[TMP3]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length4_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-MIC-AVX2-NEXT:    ret i1 [[TMP3]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length4_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP3]]
+;
+
+
+
+
+
+; Input IR under test - memcmp(X, Y, 4) tested for a nonzero result. Every prefix above expects
+; two align-1 i32 loads and one icmp ne whose i1 is returned directly (no bswap); the zext is checked but is not what is returned.
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length4_lt(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length4_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-NEXT:    ret i1 [[TMP5]]
+;
+; X64-SSE41-LABEL: define i1 @length4_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-SSE41-NEXT:    ret i1 [[TMP5]]
+;
+; X64-AVX1-LABEL: define i1 @length4_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-AVX1-NEXT:    ret i1 [[TMP5]]
+;
+; X64-AVX2-LABEL: define i1 @length4_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-AVX2-NEXT:    ret i1 [[TMP5]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length4_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-AVX512BW-256-NEXT:    ret i1 [[TMP5]]
+;
+; X64-AVX512BW-LABEL: define i1 @length4_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-AVX512BW-NEXT:    ret i1 [[TMP5]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length4_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-AVX512F-256-NEXT:    ret i1 [[TMP5]]
+;
+; X64-AVX512F-LABEL: define i1 @length4_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-AVX512F-NEXT:    ret i1 [[TMP5]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length4_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-MIC-AVX2-NEXT:    ret i1 [[TMP5]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length4_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP5]]
+;
+
+
+
+
+
+; Input IR under test - memcmp(X, Y, 4) tested for a negative result (icmp slt against 0). Every prefix
+; above expects both align-1 i32 loads to pass through llvm.bswap.i32 and be compared with an unsigned icmp ult.
+; NOTE(review) the bswap presumably makes the integer compare follow memcmp byte order on little-endian x86 - confirm.
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
+  %c = icmp slt i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length4_gt(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length4_gt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-NEXT:    ret i1 [[TMP5]]
+;
+; X64-SSE41-LABEL: define i1 @length4_gt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-SSE41-NEXT:    ret i1 [[TMP5]]
+;
+; X64-AVX1-LABEL: define i1 @length4_gt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-AVX1-NEXT:    ret i1 [[TMP5]]
+;
+; X64-AVX2-LABEL: define i1 @length4_gt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-AVX2-NEXT:    ret i1 [[TMP5]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length4_gt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-AVX512BW-256-NEXT:    ret i1 [[TMP5]]
+;
+; X64-AVX512BW-LABEL: define i1 @length4_gt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-AVX512BW-NEXT:    ret i1 [[TMP5]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length4_gt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-AVX512F-256-NEXT:    ret i1 [[TMP5]]
+;
+; X64-AVX512F-LABEL: define i1 @length4_gt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-AVX512F-NEXT:    ret i1 [[TMP5]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length4_gt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-MIC-AVX2-NEXT:    ret i1 [[TMP5]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length4_gt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP5]]
+;
+
+
+
+
+
+; Input IR under test - memcmp(X, Y, 4) tested for a positive result (icmp sgt against 0). Every prefix
+; above expects both align-1 i32 loads to pass through llvm.bswap.i32 and be compared with an unsigned icmp ugt.
+; NOTE(review) the bswap presumably makes the integer compare follow memcmp byte order on little-endian x86 - confirm.
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
+  %c = icmp sgt i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length4_eq_const(ptr %X) nounwind {
+; X64-LABEL: define i1 @length4_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 875770417
+; X64-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length4_eq_const(
+; X64-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 875770417
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length4_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 875770417
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length4_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 875770417
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length4_eq_const(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 875770417
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length4_eq_const(
+; X64-AVX512BW-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 875770417
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length4_eq_const(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 875770417
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length4_eq_const(
+; X64-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 875770417
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length4_eq_const(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 875770417
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length4_eq_const(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 875770417
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+; Input IR under test - memcmp of X against the 4 bytes at offset 1 of @.str, tested for equality with 0.
+; Every prefix above expects one align-1 i32 load of X compared (icmp ne) with the constant 875770417,
+; i.e. 0x34333231 or ASCII 1234 read as a little-endian i32, followed by a zext and an icmp eq against 0.
+; NOTE(review) the constant presumably mirrors bytes 1 to 4 of @.str; its definition is outside this view - confirm.
+  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i64 4) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length5(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length5(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br label [[ENDBLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-SSE41-LABEL: define i32 @length5(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X64-SSE41:       res_block:
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-SSE41:       loadbb:
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-SSE41-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-SSE41:       loadbb1:
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-SSE41-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-SSE41-NEXT:    br label [[ENDBLOCK]]
+; X64-SSE41:       endblock:
+; X64-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-SSE41-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX1-LABEL: define i32 @length5(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX2-LABEL: define i32 @length5(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length5(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW-256:       res_block:
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb:
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX512BW-256-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb1:
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX512BW-256-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX512BW-256:       endblock:
+; X64-AVX512BW-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-256-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512BW-LABEL: define i32 @length5(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW:       res_block:
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW:       loadbb:
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX512BW-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW:       loadbb1:
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX512BW-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX512BW:       endblock:
+; X64-AVX512BW-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length5(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F-256:       res_block:
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F-256:       loadbb:
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX512F-256-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F-256:       loadbb1:
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX512F-256-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX512F-256:       endblock:
+; X64-AVX512F-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-256-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512F-LABEL: define i32 @length5(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F:       res_block:
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F:       loadbb:
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX512F-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F:       loadbb1:
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX512F-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX512F:       endblock:
+; X64-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length5(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX2:       res_block:
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb:
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-MIC-AVX2-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb1:
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-MIC-AVX2-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK]]
+; X64-MIC-AVX2:       endblock:
+; X64-MIC-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length5(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX512F:       res_block:
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb:
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-MIC-AVX512F-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb1:
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-MIC-AVX512F-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK]]
+; X64-MIC-AVX512F:       endblock:
+; X64-MIC-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[PHI_RES]]
+;
+
+
+
+; X64-SSE2:       res_block:
+
+
+
+; X64-SSE2:       loadbb:
+
+
+
+
+
+
+; X64-SSE2:       loadbb1:
+
+
+
+
+
+
+
+
+; X64-SSE2:       endblock:
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) nounwind
+  ret i32 %m
+}
+
+define i1 @length5_eq(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length5_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; X64-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; X64-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X64-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X64-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X64-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-NEXT:    ret i1 [[TMP12]]
+;
+; X64-SSE41-LABEL: define i1 @length5_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X64-SSE41-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X64-SSE41-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-SSE41-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX1-LABEL: define i1 @length5_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX2-LABEL: define i1 @length5_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length5_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X64-AVX512BW-256-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512BW-256-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX512BW-LABEL: define i1 @length5_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X64-AVX512BW-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X64-AVX512BW-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512BW-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length5_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X64-AVX512F-256-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X64-AVX512F-256-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512F-256-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX512F-LABEL: define i1 @length5_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X64-AVX512F-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X64-AVX512F-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512F-NEXT:    ret i1 [[TMP12]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length5_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X64-MIC-AVX2-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-MIC-AVX2-NEXT:    ret i1 [[TMP12]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length5_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X64-MIC-AVX512F-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP12]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length5_lt(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length5_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br label [[ENDBLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length5_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X64-SSE41:       res_block:
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-SSE41:       loadbb:
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-SSE41-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-SSE41:       loadbb1:
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-SSE41-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-SSE41-NEXT:    br label [[ENDBLOCK]]
+; X64-SSE41:       endblock:
+; X64-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length5_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length5_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length5_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW-256:       res_block:
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb:
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX512BW-256-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb1:
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX512BW-256-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX512BW-256:       endblock:
+; X64-AVX512BW-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length5_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW:       res_block:
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW:       loadbb:
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX512BW-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW:       loadbb1:
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX512BW-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX512BW:       endblock:
+; X64-AVX512BW-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length5_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F-256:       res_block:
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F-256:       loadbb:
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX512F-256-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F-256:       loadbb1:
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX512F-256-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX512F-256:       endblock:
+; X64-AVX512F-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length5_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F:       res_block:
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F:       loadbb:
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX512F-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F:       loadbb1:
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX512F-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX512F:       endblock:
+; X64-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length5_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX2:       res_block:
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb:
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-MIC-AVX2-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb1:
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-MIC-AVX2-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK]]
+; X64-MIC-AVX2:       endblock:
+; X64-MIC-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length5_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX512F:       res_block:
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb:
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-MIC-AVX512F-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb1:
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-MIC-AVX512F-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK]]
+; X64-MIC-AVX512F:       endblock:
+; X64-MIC-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+; X64-SSE2:       res_block:
+
+
+
+; X64-SSE2:       loadbb:
+
+
+
+
+
+
+; X64-SSE2:       loadbb1:
+
+
+
+
+
+
+
+
+; X64-SSE2:       endblock:
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) nounwind
+  %c = icmp slt i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length7(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length7(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-SSE41-LABEL: define i32 @length7(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X64-SSE41:       res_block:
+; X64-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-SSE41:       loadbb:
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-SSE41-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-SSE41:       loadbb1:
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-SSE41-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X64-SSE41-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-SSE41:       endblock:
+; X64-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-SSE41-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX1-LABEL: define i32 @length7(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX1-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-AVX1-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX2-LABEL: define i32 @length7(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-AVX2-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length7(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW-256:       res_block:
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb:
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX512BW-256-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb1:
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-AVX512BW-256-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       endblock:
+; X64-AVX512BW-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-256-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512BW-LABEL: define i32 @length7(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW:       res_block:
+; X64-AVX512BW-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512BW-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW:       loadbb:
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX512BW-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW:       loadbb1:
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-AVX512BW-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW:       endblock:
+; X64-AVX512BW-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length7(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F-256:       res_block:
+; X64-AVX512F-256-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512F-256-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F-256:       loadbb:
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX512F-256-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F-256:       loadbb1:
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-AVX512F-256-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       endblock:
+; X64-AVX512F-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-256-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512F-LABEL: define i32 @length7(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F:       res_block:
+; X64-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F:       loadbb:
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX512F-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F:       loadbb1:
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-AVX512F-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X64-AVX512F-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F:       endblock:
+; X64-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length7(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX2:       res_block:
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb:
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-MIC-AVX2-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb1:
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-MIC-AVX2-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       endblock:
+; X64-MIC-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length7(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX512F:       res_block:
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb:
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-MIC-AVX512F-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb1:
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-MIC-AVX512F-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       endblock:
+; X64-MIC-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[PHI_RES]]
+;
+
+
+
+; X64-SSE2:       res_block:
+
+
+
+
+
+; X64-SSE2:       loadbb:
+
+
+
+
+
+
+; X64-SSE2:       loadbb1:
+
+
+
+
+
+
+
+
+; X64-SSE2:       endblock:
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 7) nounwind
+  ret i32 %m
+}
+
+define i1 @length7_eq(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length7_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X64-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X64-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X64-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-NEXT:    ret i1 [[TMP10]]
+;
+; X64-SSE41-LABEL: define i1 @length7_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-SSE41-NEXT:    ret i1 [[TMP10]]
+;
+; X64-AVX1-LABEL: define i1 @length7_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP10]]
+;
+; X64-AVX2-LABEL: define i1 @length7_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP10]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length7_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512BW-256-NEXT:    ret i1 [[TMP10]]
+;
+; X64-AVX512BW-LABEL: define i1 @length7_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512BW-NEXT:    ret i1 [[TMP10]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length7_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512F-256-NEXT:    ret i1 [[TMP10]]
+;
+; X64-AVX512F-LABEL: define i1 @length7_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512F-NEXT:    ret i1 [[TMP10]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length7_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-MIC-AVX2-NEXT:    ret i1 [[TMP10]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length7_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP10]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 7) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length7_lt(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length7_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length7_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X64-SSE41:       res_block:
+; X64-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-SSE41:       loadbb:
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-SSE41-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-SSE41:       loadbb1:
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-SSE41-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X64-SSE41-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-SSE41:       endblock:
+; X64-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length7_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX1-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-AVX1-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length7_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-AVX2-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length7_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW-256:       res_block:
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb:
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX512BW-256-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb1:
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-AVX512BW-256-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       endblock:
+; X64-AVX512BW-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length7_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW:       res_block:
+; X64-AVX512BW-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512BW-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW:       loadbb:
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX512BW-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW:       loadbb1:
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-AVX512BW-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW:       endblock:
+; X64-AVX512BW-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length7_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F-256:       res_block:
+; X64-AVX512F-256-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512F-256-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F-256:       loadbb:
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX512F-256-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F-256:       loadbb1:
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-AVX512F-256-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       endblock:
+; X64-AVX512F-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length7_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F:       res_block:
+; X64-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F:       loadbb:
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX512F-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F:       loadbb1:
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-AVX512F-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X64-AVX512F-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F:       endblock:
+; X64-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length7_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX2:       res_block:
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb:
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-MIC-AVX2-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb1:
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-MIC-AVX2-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       endblock:
+; X64-MIC-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length7_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX512F:       res_block:
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb:
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-MIC-AVX512F-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb1:
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-MIC-AVX512F-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       endblock:
+; X64-MIC-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+; X64-SSE2:       res_block:
+
+
+
+
+
+; X64-SSE2:       loadbb:
+
+
+
+
+
+
+; X64-SSE2:       loadbb1:
+
+
+
+
+
+
+
+
+; X64-SSE2:       endblock:
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 7) nounwind
+  %c = icmp slt i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length8(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length8(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP1]])
+; X64-NEXT:    [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
+; X64-NEXT:    [[TMP5:%.*]] = icmp ugt i64 [[TMP3]], [[TMP4]]
+; X64-NEXT:    [[TMP6:%.*]] = icmp ult i64 [[TMP3]], [[TMP4]]
+; X64-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-NEXT:    ret i32 [[TMP9]]
+;
+; X64-SSE41-LABEL: define i32 @length8(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP1]])
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = icmp ugt i64 [[TMP3]], [[TMP4]]
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = icmp ult i64 [[TMP3]], [[TMP4]]
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-SSE41-NEXT:    ret i32 [[TMP9]]
+;
+; X64-AVX1-LABEL: define i32 @length8(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP1]])
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = icmp ugt i64 [[TMP3]], [[TMP4]]
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = icmp ult i64 [[TMP3]], [[TMP4]]
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-AVX1-NEXT:    ret i32 [[TMP9]]
+;
+; X64-AVX2-LABEL: define i32 @length8(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP1]])
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = icmp ugt i64 [[TMP3]], [[TMP4]]
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = icmp ult i64 [[TMP3]], [[TMP4]]
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-AVX2-NEXT:    ret i32 [[TMP9]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length8(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP1]])
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = icmp ugt i64 [[TMP3]], [[TMP4]]
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = icmp ult i64 [[TMP3]], [[TMP4]]
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-AVX512BW-256-NEXT:    ret i32 [[TMP9]]
+;
+; X64-AVX512BW-LABEL: define i32 @length8(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP1]])
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = icmp ugt i64 [[TMP3]], [[TMP4]]
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = icmp ult i64 [[TMP3]], [[TMP4]]
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-AVX512BW-NEXT:    ret i32 [[TMP9]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length8(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP1]])
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = icmp ugt i64 [[TMP3]], [[TMP4]]
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = icmp ult i64 [[TMP3]], [[TMP4]]
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-AVX512F-256-NEXT:    ret i32 [[TMP9]]
+;
+; X64-AVX512F-LABEL: define i32 @length8(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP1]])
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = icmp ugt i64 [[TMP3]], [[TMP4]]
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = icmp ult i64 [[TMP3]], [[TMP4]]
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-AVX512F-NEXT:    ret i32 [[TMP9]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length8(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP1]])
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = icmp ugt i64 [[TMP3]], [[TMP4]]
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = icmp ult i64 [[TMP3]], [[TMP4]]
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-MIC-AVX2-NEXT:    ret i32 [[TMP9]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length8(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP1]])
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = icmp ugt i64 [[TMP3]], [[TMP4]]
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = icmp ult i64 [[TMP3]], [[TMP4]]
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[TMP9]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 8) nounwind
+  ret i32 %m
+}
+
+define i1 @length8_eq(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length8_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length8_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length8_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length8_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length8_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length8_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length8_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length8_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length8_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length8_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 8) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length8_eq_const(ptr %X) nounwind {
+; X64-LABEL: define i1 @length8_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = icmp ne i64 [[TMP1]], 3978425819141910832
+; X64-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-NEXT:    ret i1 [[TMP2]]
+;
+; X64-SSE41-LABEL: define i1 @length8_eq_const(
+; X64-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = icmp ne i64 [[TMP1]], 3978425819141910832
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-SSE41-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX1-LABEL: define i1 @length8_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = icmp ne i64 [[TMP1]], 3978425819141910832
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX2-LABEL: define i1 @length8_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = icmp ne i64 [[TMP1]], 3978425819141910832
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length8_eq_const(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = icmp ne i64 [[TMP1]], 3978425819141910832
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX512BW-256-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX512BW-LABEL: define i1 @length8_eq_const(
+; X64-AVX512BW-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = icmp ne i64 [[TMP1]], 3978425819141910832
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX512BW-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length8_eq_const(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = icmp ne i64 [[TMP1]], 3978425819141910832
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX512F-256-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX512F-LABEL: define i1 @length8_eq_const(
+; X64-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = icmp ne i64 [[TMP1]], 3978425819141910832
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX512F-NEXT:    ret i1 [[TMP2]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length8_eq_const(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = icmp ne i64 [[TMP1]], 3978425819141910832
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-MIC-AVX2-NEXT:    ret i1 [[TMP2]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length8_eq_const(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = icmp ne i64 [[TMP1]], 3978425819141910832
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP2]]
+;
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 8) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length9_eq(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length9_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i64
+; X64-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i64
+; X64-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length9_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i64
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i64
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-SSE41-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-SSE41-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length9_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i64
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i64
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length9_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i64
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i64
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length9_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i64
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i64
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-AVX512BW-256-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length9_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i64
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i64
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-AVX512BW-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-AVX512BW-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length9_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i64
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i64
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-AVX512F-256-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-AVX512F-256-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length9_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i64
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i64
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-AVX512F-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-AVX512F-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length9_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i64
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i64
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-MIC-AVX2-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length9_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i64
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i64
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-MIC-AVX512F-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 9) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length10_eq(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length10_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i16, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP6]] to i64
+; X64-NEXT:    [[TMP9:%.*]] = zext i16 [[TMP7]] to i64
+; X64-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length10_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP4]], align 1
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i16, ptr [[TMP5]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP6]] to i64
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = zext i16 [[TMP7]] to i64
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-SSE41-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-SSE41-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length10_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i16, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP6]] to i64
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = zext i16 [[TMP7]] to i64
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length10_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i16, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP6]] to i64
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = zext i16 [[TMP7]] to i64
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length10_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP4]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i16, ptr [[TMP5]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP6]] to i64
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = zext i16 [[TMP7]] to i64
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-AVX512BW-256-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length10_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i16, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP6]] to i64
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = zext i16 [[TMP7]] to i64
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-AVX512BW-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-AVX512BW-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length10_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP4]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i16, ptr [[TMP5]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP6]] to i64
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = zext i16 [[TMP7]] to i64
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-AVX512F-256-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-AVX512F-256-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length10_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i16, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP6]] to i64
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = zext i16 [[TMP7]] to i64
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-AVX512F-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-AVX512F-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length10_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP4]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i16, ptr [[TMP5]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP6]] to i64
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = zext i16 [[TMP7]] to i64
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-MIC-AVX2-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length10_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i16, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP6]] to i64
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = zext i16 [[TMP7]] to i64
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-MIC-AVX512F-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 10) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length11_eq(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length11_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length11_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length11_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length11_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length11_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length11_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length11_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length11_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length11_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length11_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 11) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length12_eq(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length12_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
+; X64-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP7]] to i64
+; X64-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-NEXT:    ret i1 [[TMP12]]
+;
+; X64-SSE41-LABEL: define i1 @length12_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP7]] to i64
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-SSE41-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-SSE41-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-SSE41-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX1-LABEL: define i1 @length12_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP7]] to i64
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX2-LABEL: define i1 @length12_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP7]] to i64
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length12_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP7]] to i64
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-AVX512BW-256-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512BW-256-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX512BW-LABEL: define i1 @length12_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP7]] to i64
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-AVX512BW-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-AVX512BW-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512BW-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length12_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP7]] to i64
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-AVX512F-256-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-AVX512F-256-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512F-256-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX512F-LABEL: define i1 @length12_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP7]] to i64
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-AVX512F-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-AVX512F-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512F-NEXT:    ret i1 [[TMP12]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length12_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP7]] to i64
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-MIC-AVX2-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-MIC-AVX2-NEXT:    ret i1 [[TMP12]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length12_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP7]] to i64
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-MIC-AVX512F-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP12]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 12) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length12(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length12(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP14:%.*]], [[LOADBB1:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP15:%.*]], [[LOADBB1]] ]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-NEXT:    [[TMP13:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-NEXT:    [[TMP14]] = zext i32 [[TMP12]] to i64
+; X64-NEXT:    [[TMP15]] = zext i32 [[TMP13]] to i64
+; X64-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[TMP14]], [[TMP15]]
+; X64-NEXT:    br i1 [[TMP16]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-SSE41-LABEL: define i32 @length12(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X64-SSE41:       res_block:
+; X64-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP14:%.*]], [[LOADBB1:%.*]] ]
+; X64-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP15:%.*]], [[LOADBB1]] ]
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-SSE41:       loadbb:
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-SSE41-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-SSE41:       loadbb1:
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-SSE41-NEXT:    [[TMP13:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-SSE41-NEXT:    [[TMP14]] = zext i32 [[TMP12]] to i64
+; X64-SSE41-NEXT:    [[TMP15]] = zext i32 [[TMP13]] to i64
+; X64-SSE41-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[TMP14]], [[TMP15]]
+; X64-SSE41-NEXT:    br i1 [[TMP16]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-SSE41:       endblock:
+; X64-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-SSE41-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX1-LABEL: define i32 @length12(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP14:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX1-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP15:%.*]], [[LOADBB1]] ]
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-AVX1-NEXT:    [[TMP14]] = zext i32 [[TMP12]] to i64
+; X64-AVX1-NEXT:    [[TMP15]] = zext i32 [[TMP13]] to i64
+; X64-AVX1-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[TMP14]], [[TMP15]]
+; X64-AVX1-NEXT:    br i1 [[TMP16]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX2-LABEL: define i32 @length12(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP14:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP15:%.*]], [[LOADBB1]] ]
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-AVX2-NEXT:    [[TMP14]] = zext i32 [[TMP12]] to i64
+; X64-AVX2-NEXT:    [[TMP15]] = zext i32 [[TMP13]] to i64
+; X64-AVX2-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[TMP14]], [[TMP15]]
+; X64-AVX2-NEXT:    br i1 [[TMP16]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length12(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW-256:       res_block:
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP14:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP15:%.*]], [[LOADBB1]] ]
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb:
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb1:
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-AVX512BW-256-NEXT:    [[TMP13:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-AVX512BW-256-NEXT:    [[TMP14]] = zext i32 [[TMP12]] to i64
+; X64-AVX512BW-256-NEXT:    [[TMP15]] = zext i32 [[TMP13]] to i64
+; X64-AVX512BW-256-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[TMP14]], [[TMP15]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP16]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       endblock:
+; X64-AVX512BW-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-256-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512BW-LABEL: define i32 @length12(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW:       res_block:
+; X64-AVX512BW-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP14:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512BW-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP15:%.*]], [[LOADBB1]] ]
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW:       loadbb:
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW:       loadbb1:
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-AVX512BW-NEXT:    [[TMP13:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-AVX512BW-NEXT:    [[TMP14]] = zext i32 [[TMP12]] to i64
+; X64-AVX512BW-NEXT:    [[TMP15]] = zext i32 [[TMP13]] to i64
+; X64-AVX512BW-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[TMP14]], [[TMP15]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP16]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW:       endblock:
+; X64-AVX512BW-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length12(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F-256:       res_block:
+; X64-AVX512F-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP14:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512F-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP15:%.*]], [[LOADBB1]] ]
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F-256:       loadbb:
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F-256:       loadbb1:
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-AVX512F-256-NEXT:    [[TMP13:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-AVX512F-256-NEXT:    [[TMP14]] = zext i32 [[TMP12]] to i64
+; X64-AVX512F-256-NEXT:    [[TMP15]] = zext i32 [[TMP13]] to i64
+; X64-AVX512F-256-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[TMP14]], [[TMP15]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP16]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       endblock:
+; X64-AVX512F-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-256-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512F-LABEL: define i32 @length12(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F:       res_block:
+; X64-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP14:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP15:%.*]], [[LOADBB1]] ]
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F:       loadbb:
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F:       loadbb1:
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-AVX512F-NEXT:    [[TMP13:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-AVX512F-NEXT:    [[TMP14]] = zext i32 [[TMP12]] to i64
+; X64-AVX512F-NEXT:    [[TMP15]] = zext i32 [[TMP13]] to i64
+; X64-AVX512F-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[TMP14]], [[TMP15]]
+; X64-AVX512F-NEXT:    br i1 [[TMP16]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F:       endblock:
+; X64-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length12(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX2:       res_block:
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP14:%.*]], [[LOADBB1:%.*]] ]
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP15:%.*]], [[LOADBB1]] ]
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb:
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb1:
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-MIC-AVX2-NEXT:    [[TMP13:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-MIC-AVX2-NEXT:    [[TMP14]] = zext i32 [[TMP12]] to i64
+; X64-MIC-AVX2-NEXT:    [[TMP15]] = zext i32 [[TMP13]] to i64
+; X64-MIC-AVX2-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[TMP14]], [[TMP15]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP16]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       endblock:
+; X64-MIC-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length12(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX512F:       res_block:
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP14:%.*]], [[LOADBB1:%.*]] ]
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP15:%.*]], [[LOADBB1]] ]
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb:
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb1:
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-MIC-AVX512F-NEXT:    [[TMP13:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-MIC-AVX512F-NEXT:    [[TMP14]] = zext i32 [[TMP12]] to i64
+; X64-MIC-AVX512F-NEXT:    [[TMP15]] = zext i32 [[TMP13]] to i64
+; X64-MIC-AVX512F-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[TMP14]], [[TMP15]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP16]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       endblock:
+; X64-MIC-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[PHI_RES]]
+;
+
+
+
+; X64-SSE2:       res_block:
+
+
+
+
+
+; X64-SSE2:       loadbb:
+
+
+
+
+
+
+; X64-SSE2:       loadbb1:
+
+
+
+
+
+
+
+
+
+
+; X64-SSE2:       endblock:
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 12) nounwind
+  ret i32 %m
+}
+
+define i1 @length13_eq(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length13_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 5
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 5
+; X64-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length13_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 5
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 5
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length13_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 5
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 5
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length13_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 5
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 5
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length13_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 5
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 5
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length13_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 5
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 5
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length13_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 5
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 5
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length13_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 5
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 5
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length13_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 5
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 5
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length13_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 5
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 5
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 13) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length14_eq(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length14_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 6
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 6
+; X64-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length14_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 6
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 6
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length14_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 6
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 6
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length14_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 6
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 6
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length14_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 6
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 6
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length14_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 6
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 6
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length14_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 6
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 6
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length14_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 6
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 6
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length14_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 6
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 6
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length14_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 6
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 6
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 14) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length15_eq(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @length15_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length15_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length15_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length15_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length15_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length15_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length15_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length15_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length15_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length15_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 15) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; PR33329 - https://bugs.llvm.org/show_bug.cgi?id=33329
+
+define i32 @length16(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length16(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-SSE41-LABEL: define i32 @length16(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X64-SSE41:       res_block:
+; X64-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-SSE41:       loadbb:
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-SSE41-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-SSE41:       loadbb1:
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-SSE41-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-SSE41-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-SSE41:       endblock:
+; X64-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-SSE41-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX1-LABEL: define i32 @length16(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX1-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX1-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX2-LABEL: define i32 @length16(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length16(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW-256:       res_block:
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb:
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb1:
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       endblock:
+; X64-AVX512BW-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-256-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512BW-LABEL: define i32 @length16(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW:       res_block:
+; X64-AVX512BW-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512BW-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW:       loadbb:
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW:       loadbb1:
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW:       endblock:
+; X64-AVX512BW-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length16(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F-256:       res_block:
+; X64-AVX512F-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512F-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F-256:       loadbb:
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F-256:       loadbb1:
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       endblock:
+; X64-AVX512F-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-256-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512F-LABEL: define i32 @length16(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F:       res_block:
+; X64-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F:       loadbb:
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F:       loadbb1:
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F:       endblock:
+; X64-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length16(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX2:       res_block:
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb:
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb1:
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       endblock:
+; X64-MIC-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length16(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX512F:       res_block:
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb:
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb1:
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       endblock:
+; X64-MIC-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[PHI_RES]]
+;
+
+
+
+; X64-SSE2:       res_block:
+
+
+
+
+
+; X64-SSE2:       loadbb:
+
+
+
+
+
+
+; X64-SSE2:       loadbb1:
+
+
+
+
+
+
+
+
+; X64-SSE2:       endblock:
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 16) nounwind
+  ret i32 %m
+}
+
+define i1 @length16_eq(ptr %x, ptr %y) nounwind {
+; memcmp(x, y, 16) compared ne 0 - the IR check groups below expect one i128 load (align 1) per pointer and a single icmp ne, with no memcmp call left.
+; X64-LABEL: define i1 @length16_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = icmp ne i128 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-NEXT:    ret i1 [[TMP3]]
+;
+; X64-SSE41-LABEL: define i1 @length16_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = icmp ne i128 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-SSE41-NEXT:    ret i1 [[TMP3]]
+;
+; X64-AVX1-LABEL: define i1 @length16_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = icmp ne i128 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP3]]
+;
+; X64-AVX2-LABEL: define i1 @length16_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = icmp ne i128 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP3]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length16_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = icmp ne i128 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512BW-256-NEXT:    ret i1 [[TMP3]]
+;
+; X64-AVX512BW-LABEL: define i1 @length16_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = icmp ne i128 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512BW-NEXT:    ret i1 [[TMP3]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length16_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = icmp ne i128 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512F-256-NEXT:    ret i1 [[TMP3]]
+;
+; X64-AVX512F-LABEL: define i1 @length16_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = icmp ne i128 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512F-NEXT:    ret i1 [[TMP3]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length16_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = icmp ne i128 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-MIC-AVX2-NEXT:    ret i1 [[TMP3]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length16_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = icmp ne i128 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP3]]
+;
+; NOTE(review) - the X64-AVX and X64-MIC-AVX groups below match x86 assembly rather than IR; presumably leftovers from an assembly-checking version of this test, confirm against the RUN lines.
+; X64-AVX-LABEL: length16_eq:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-AVX-NEXT:    vpxor (%rsi), %xmm0, %xmm0
+; X64-AVX-NEXT:    vptest %xmm0, %xmm0
+; X64-AVX-NEXT:    setne %al
+; X64-AVX-NEXT:    retq
+; X64-MIC-AVX-LABEL: length16_eq:
+; X64-MIC-AVX:       # %bb.0:
+; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-MIC-AVX-NEXT:    vmovdqu (%rsi), %xmm1
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0
+; X64-MIC-AVX-NEXT:    kortestw %k0, %k0
+; X64-MIC-AVX-NEXT:    setne %al
+; X64-MIC-AVX-NEXT:    vzeroupper
+; X64-MIC-AVX-NEXT:    retq
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 16) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length16_lt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length16_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length16_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X64-SSE41:       res_block:
+; X64-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-SSE41:       loadbb:
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-SSE41-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-SSE41:       loadbb1:
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-SSE41-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-SSE41-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-SSE41:       endblock:
+; X64-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length16_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX1-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX1-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length16_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length16_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW-256:       res_block:
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb:
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb1:
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       endblock:
+; X64-AVX512BW-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length16_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW:       res_block:
+; X64-AVX512BW-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512BW-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW:       loadbb:
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW:       loadbb1:
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW:       endblock:
+; X64-AVX512BW-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length16_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F-256:       res_block:
+; X64-AVX512F-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512F-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F-256:       loadbb:
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F-256:       loadbb1:
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       endblock:
+; X64-AVX512F-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length16_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F:       res_block:
+; X64-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F:       loadbb:
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F:       loadbb1:
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F:       endblock:
+; X64-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length16_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX2:       res_block:
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb:
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb1:
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       endblock:
+; X64-MIC-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length16_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX512F:       res_block:
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb:
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb1:
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       endblock:
+; X64-MIC-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+; X64-SSE2:       res_block:
+
+
+
+
+
+; X64-SSE2:       loadbb:
+
+
+
+
+
+
+; X64-SSE2:       loadbb1:
+
+
+
+
+
+
+
+
+; X64-SSE2:       endblock:
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 16) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length16_gt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length16_gt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length16_gt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X64-SSE41:       res_block:
+; X64-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-SSE41:       loadbb:
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-SSE41-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-SSE41:       loadbb1:
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-SSE41-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-SSE41-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-SSE41:       endblock:
+; X64-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length16_gt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX1-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX1-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length16_gt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length16_gt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW-256:       res_block:
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb:
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb1:
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       endblock:
+; X64-AVX512BW-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length16_gt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW:       res_block:
+; X64-AVX512BW-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512BW-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW:       loadbb:
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW:       loadbb1:
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW:       endblock:
+; X64-AVX512BW-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length16_gt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F-256:       res_block:
+; X64-AVX512F-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512F-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F-256:       loadbb:
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F-256:       loadbb1:
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       endblock:
+; X64-AVX512F-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length16_gt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F:       res_block:
+; X64-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F:       loadbb:
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F:       loadbb1:
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F:       endblock:
+; X64-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length16_gt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX2:       res_block:
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb:
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb1:
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       endblock:
+; X64-MIC-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length16_gt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX512F:       res_block:
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb:
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb1:
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       endblock:
+; X64-MIC-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; NOTE(review): the SSE2-prefixed checks below match only basic-block labels
+; and carry no LABEL/NEXT lines; they look like leftovers from an earlier
+; check set. Confirm a RUN line still uses that prefix, or regenerate them.
+; X64-SSE2:       res_block:
+
+
+
+
+
+; X64-SSE2:       loadbb:
+
+
+
+
+
+
+; X64-SSE2:       loadbb1:
+
+
+
+
+
+
+
+
+; X64-SSE2:       endblock:
+; Input under test: a 16-byte memcmp whose result is tested with `> 0` (sgt).
+; The full check blocks above expect it expanded into two 8-byte loads per
+; buffer, byte-swapped and compared, with res_block yielding -1 or 1.
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 16) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length16_eq_const(ptr %X) nounwind {
+;
+; X64-LABEL: define i1 @length16_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = icmp ne i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length16_eq_const(
+; X64-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = icmp ne i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length16_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = icmp ne i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length16_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = icmp ne i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length16_eq_const(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = icmp ne i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length16_eq_const(
+; X64-AVX512BW-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = icmp ne i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length16_eq_const(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = icmp ne i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length16_eq_const(
+; X64-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = icmp ne i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length16_eq_const(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = icmp ne i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length16_eq_const(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = icmp ne i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-AVX-LABEL: length16_eq_const:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-AVX-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT:    vptest %xmm0, %xmm0
+; X64-AVX-NEXT:    sete %al
+; X64-AVX-NEXT:    retq
+; X64-MIC-AVX-LABEL: length16_eq_const:
+; X64-MIC-AVX:       # %bb.0:
+; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-MIC-AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [858927408,926299444,825243960,892613426]
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0
+; X64-MIC-AVX-NEXT:    kortestw %k0, %k0
+; X64-MIC-AVX-NEXT:    sete %al
+; X64-MIC-AVX-NEXT:    vzeroupper
+; X64-MIC-AVX-NEXT:    retq
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 16) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; PR33914 - https://bugs.llvm.org/show_bug.cgi?id=33914
+
+define i32 @length24(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length24(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64:       loadbb2:
+; X64-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-SSE41-LABEL: define i32 @length24(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X64-SSE41:       res_block:
+; X64-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-SSE41:       loadbb:
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-SSE41-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-SSE41:       loadbb1:
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-SSE41-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-SSE41-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-SSE41:       loadbb2:
+; X64-SSE41-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-SSE41-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-SSE41-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-SSE41-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-SSE41-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-SSE41-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-SSE41-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-SSE41-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-SSE41:       endblock:
+; X64-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-SSE41-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX1-LABEL: define i32 @length24(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-AVX1-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX1-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX1:       loadbb2:
+; X64-AVX1-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX1-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX1-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX1-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX1-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX1-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX1-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX1-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX2-LABEL: define i32 @length24(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX2:       loadbb2:
+; X64-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX2-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX2-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX2-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX2-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX2-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX2-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX2-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length24(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW-256:       res_block:
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb:
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb1:
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       loadbb2:
+; X64-AVX512BW-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512BW-256-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512BW-256-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       endblock:
+; X64-AVX512BW-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-256-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512BW-LABEL: define i32 @length24(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW:       res_block:
+; X64-AVX512BW-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-AVX512BW-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW:       loadbb:
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW:       loadbb1:
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512BW:       loadbb2:
+; X64-AVX512BW-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512BW-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512BW-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512BW-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512BW-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW:       endblock:
+; X64-AVX512BW-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length24(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F-256:       res_block:
+; X64-AVX512F-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-AVX512F-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F-256:       loadbb:
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F-256:       loadbb1:
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       loadbb2:
+; X64-AVX512F-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512F-256-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512F-256-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       endblock:
+; X64-AVX512F-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-256-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512F-LABEL: define i32 @length24(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F:       res_block:
+; X64-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F:       loadbb:
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F:       loadbb1:
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512F:       loadbb2:
+; X64-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512F-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512F-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512F-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512F-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512F-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F:       endblock:
+; X64-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length24(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX2:       res_block:
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb:
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb1:
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       loadbb2:
+; X64-MIC-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-MIC-AVX2-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-MIC-AVX2-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       endblock:
+; X64-MIC-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length24(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX512F:       res_block:
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb:
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb1:
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       loadbb2:
+; X64-MIC-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-MIC-AVX512F-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-MIC-AVX512F-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       endblock:
+; X64-MIC-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[PHI_RES]]
+;
+
+
+
+; X64-SSE2:       res_block:
+
+
+
+
+
+; X64-SSE2:       loadbb:
+
+
+
+
+
+
+; X64-SSE2:       loadbb1:
+
+
+
+
+
+
+
+
+; X64-SSE2:       loadbb2:
+
+
+
+
+
+
+
+
+; X64-SSE2:       endblock:
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 24) nounwind
+  ret i32 %m
+}
+
+define i1 @length24_eq(ptr %x, ptr %y) nounwind {
+;
+; X64-LABEL: define i1 @length24_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = zext i64 [[TMP6]] to i128
+; X64-NEXT:    [[TMP9:%.*]] = zext i64 [[TMP7]] to i128
+; X64-NEXT:    [[TMP10:%.*]] = xor i128 [[TMP8]], [[TMP9]]
+; X64-NEXT:    [[TMP11:%.*]] = or i128 [[TMP3]], [[TMP10]]
+; X64-NEXT:    [[TMP12:%.*]] = icmp ne i128 [[TMP11]], 0
+; X64-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length24_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = zext i64 [[TMP6]] to i128
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = zext i64 [[TMP7]] to i128
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = xor i128 [[TMP8]], [[TMP9]]
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = or i128 [[TMP3]], [[TMP10]]
+; X64-SSE41-NEXT:    [[TMP12:%.*]] = icmp ne i128 [[TMP11]], 0
+; X64-SSE41-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length24_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = zext i64 [[TMP6]] to i128
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = zext i64 [[TMP7]] to i128
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = xor i128 [[TMP8]], [[TMP9]]
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = or i128 [[TMP3]], [[TMP10]]
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = icmp ne i128 [[TMP11]], 0
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length24_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = zext i64 [[TMP6]] to i128
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = zext i64 [[TMP7]] to i128
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = xor i128 [[TMP8]], [[TMP9]]
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = or i128 [[TMP3]], [[TMP10]]
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = icmp ne i128 [[TMP11]], 0
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length24_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = zext i64 [[TMP6]] to i128
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = zext i64 [[TMP7]] to i128
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = xor i128 [[TMP8]], [[TMP9]]
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = or i128 [[TMP3]], [[TMP10]]
+; X64-AVX512BW-256-NEXT:    [[TMP12:%.*]] = icmp ne i128 [[TMP11]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length24_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = zext i64 [[TMP6]] to i128
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = zext i64 [[TMP7]] to i128
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = xor i128 [[TMP8]], [[TMP9]]
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = or i128 [[TMP3]], [[TMP10]]
+; X64-AVX512BW-NEXT:    [[TMP12:%.*]] = icmp ne i128 [[TMP11]], 0
+; X64-AVX512BW-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length24_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = zext i64 [[TMP6]] to i128
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = zext i64 [[TMP7]] to i128
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = xor i128 [[TMP8]], [[TMP9]]
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = or i128 [[TMP3]], [[TMP10]]
+; X64-AVX512F-256-NEXT:    [[TMP12:%.*]] = icmp ne i128 [[TMP11]], 0
+; X64-AVX512F-256-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length24_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = zext i64 [[TMP6]] to i128
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = zext i64 [[TMP7]] to i128
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = xor i128 [[TMP8]], [[TMP9]]
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = or i128 [[TMP3]], [[TMP10]]
+; X64-AVX512F-NEXT:    [[TMP12:%.*]] = icmp ne i128 [[TMP11]], 0
+; X64-AVX512F-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length24_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = zext i64 [[TMP6]] to i128
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = zext i64 [[TMP7]] to i128
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = xor i128 [[TMP8]], [[TMP9]]
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = or i128 [[TMP3]], [[TMP10]]
+; X64-MIC-AVX2-NEXT:    [[TMP12:%.*]] = icmp ne i128 [[TMP11]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length24_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = zext i64 [[TMP6]] to i128
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = zext i64 [[TMP7]] to i128
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = xor i128 [[TMP8]], [[TMP9]]
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = or i128 [[TMP3]], [[TMP10]]
+; X64-MIC-AVX512F-NEXT:    [[TMP12:%.*]] = icmp ne i128 [[TMP11]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX-LABEL: length24_eq:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; X64-AVX-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
+; X64-AVX-NEXT:    vpxor %xmm2, %xmm1, %xmm1
+; X64-AVX-NEXT:    vpxor (%rsi), %xmm0, %xmm0
+; X64-AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; X64-AVX-NEXT:    vptest %xmm0, %xmm0
+; X64-AVX-NEXT:    sete %al
+; X64-AVX-NEXT:    retq
+; X64-MIC-AVX-LABEL: length24_eq:
+; X64-MIC-AVX:       # %bb.0:
+; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-MIC-AVX-NEXT:    vmovdqu (%rsi), %xmm1
+; X64-MIC-AVX-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
+; X64-MIC-AVX-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm3, %zmm2, %k0
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm1, %zmm0, %k1
+; X64-MIC-AVX-NEXT:    kortestw %k0, %k1
+; X64-MIC-AVX-NEXT:    sete %al
+; X64-MIC-AVX-NEXT:    vzeroupper
+; X64-MIC-AVX-NEXT:    retq
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 24) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length24_lt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length24_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64:       loadbb2:
+; X64-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length24_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X64-SSE41:       res_block:
+; X64-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-SSE41:       loadbb:
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-SSE41-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-SSE41:       loadbb1:
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-SSE41-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-SSE41-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-SSE41:       loadbb2:
+; X64-SSE41-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-SSE41-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-SSE41-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-SSE41-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-SSE41-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-SSE41-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-SSE41-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-SSE41-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-SSE41:       endblock:
+; X64-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length24_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-AVX1-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX1-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX1:       loadbb2:
+; X64-AVX1-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX1-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX1-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX1-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX1-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX1-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX1-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX1-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length24_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX2:       loadbb2:
+; X64-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX2-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX2-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX2-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX2-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX2-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX2-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX2-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length24_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW-256:       res_block:
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb:
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb1:
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       loadbb2:
+; X64-AVX512BW-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512BW-256-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512BW-256-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       endblock:
+; X64-AVX512BW-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length24_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW:       res_block:
+; X64-AVX512BW-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-AVX512BW-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW:       loadbb:
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW:       loadbb1:
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512BW:       loadbb2:
+; X64-AVX512BW-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512BW-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512BW-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512BW-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512BW-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW:       endblock:
+; X64-AVX512BW-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length24_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F-256:       res_block:
+; X64-AVX512F-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-AVX512F-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F-256:       loadbb:
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F-256:       loadbb1:
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       loadbb2:
+; X64-AVX512F-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512F-256-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512F-256-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       endblock:
+; X64-AVX512F-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length24_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F:       res_block:
+; X64-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F:       loadbb:
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F:       loadbb1:
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512F:       loadbb2:
+; X64-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512F-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512F-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512F-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512F-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512F-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F:       endblock:
+; X64-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length24_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX2:       res_block:
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb:
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb1:
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       loadbb2:
+; X64-MIC-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-MIC-AVX2-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-MIC-AVX2-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       endblock:
+; X64-MIC-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length24_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX512F:       res_block:
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb:
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb1:
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       loadbb2:
+; X64-MIC-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-MIC-AVX512F-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-MIC-AVX512F-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       endblock:
+; X64-MIC-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+; X64-SSE2:       res_block:
+
+
+
+
+
+; X64-SSE2:       loadbb:
+
+
+
+
+
+
+; X64-SSE2:       loadbb1:
+
+
+
+
+
+
+
+
+; X64-SSE2:       loadbb2:
+
+
+
+
+
+
+
+
+; X64-SSE2:       endblock:
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 24) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length24_gt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length24_gt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64:       loadbb2:
+; X64-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length24_gt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X64-SSE41:       res_block:
+; X64-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-SSE41:       loadbb:
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-SSE41-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-SSE41:       loadbb1:
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-SSE41-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-SSE41-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-SSE41:       loadbb2:
+; X64-SSE41-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-SSE41-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-SSE41-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-SSE41-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-SSE41-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-SSE41-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-SSE41-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-SSE41-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-SSE41:       endblock:
+; X64-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length24_gt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-AVX1-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX1-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX1:       loadbb2:
+; X64-AVX1-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX1-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX1-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX1-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX1-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX1-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX1-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX1-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length24_gt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX2:       loadbb2:
+; X64-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX2-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX2-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX2-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX2-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX2-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX2-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX2-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length24_gt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW-256:       res_block:
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb:
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb1:
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       loadbb2:
+; X64-AVX512BW-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512BW-256-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512BW-256-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       endblock:
+; X64-AVX512BW-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length24_gt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW:       res_block:
+; X64-AVX512BW-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-AVX512BW-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW:       loadbb:
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW:       loadbb1:
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512BW:       loadbb2:
+; X64-AVX512BW-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512BW-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512BW-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512BW-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512BW-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW:       endblock:
+; X64-AVX512BW-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length24_gt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F-256:       res_block:
+; X64-AVX512F-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-AVX512F-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F-256:       loadbb:
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F-256:       loadbb1:
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       loadbb2:
+; X64-AVX512F-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512F-256-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512F-256-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       endblock:
+; X64-AVX512F-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length24_gt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F:       res_block:
+; X64-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F:       loadbb:
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F:       loadbb1:
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512F:       loadbb2:
+; X64-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512F-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512F-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512F-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512F-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512F-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F:       endblock:
+; X64-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length24_gt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX2:       res_block:
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb:
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb1:
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       loadbb2:
+; X64-MIC-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-MIC-AVX2-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-MIC-AVX2-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       endblock:
+; X64-MIC-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length24_gt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX512F:       res_block:
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ]
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ]
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb:
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb1:
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       loadbb2:
+; X64-MIC-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-MIC-AVX512F-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-MIC-AVX512F-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP21]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       endblock:
+; X64-MIC-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB2]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+; X64-SSE2:       res_block:
+
+
+
+
+
+; X64-SSE2:       loadbb:
+
+
+
+
+
+
+; X64-SSE2:       loadbb1:
+
+
+
+
+
+
+
+
+; X64-SSE2:       loadbb2:
+
+
+
+
+
+
+
+
+; X64-SSE2:       endblock:
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 24) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length24_eq_const(ptr %X) nounwind {
+; 24-byte memcmp of %X against @.str whose result is only tested with "icmp ne ..., 0"; the IR check blocks expect straight-line code - an i128 load and a zero-extended i64 load at offset 16, each xor'ed with a constant, or'ed together and compared with 0.
+; X64-LABEL: define i1 @length24_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 1
+; X64-NEXT:    [[TMP5:%.*]] = zext i64 [[TMP4]] to i128
+; X64-NEXT:    [[TMP6:%.*]] = xor i128 [[TMP5]], 3689065127958034230
+; X64-NEXT:    [[TMP7:%.*]] = or i128 [[TMP2]], [[TMP6]]
+; X64-NEXT:    [[TMP8:%.*]] = icmp ne i128 [[TMP7]], 0
+; X64-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; X64-NEXT:    ret i1 [[TMP8]]
+; NOTE(review): in every IR check block of this function the zext into [[TMP9]] is never used (the ret takes [[TMP8]]) - presumably a dead leftover of the expansion; confirm this is the intended output.
+; X64-SSE41-LABEL: define i1 @length24_eq_const(
+; X64-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 1
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = zext i64 [[TMP4]] to i128
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = xor i128 [[TMP5]], 3689065127958034230
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = or i128 [[TMP2]], [[TMP6]]
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = icmp ne i128 [[TMP7]], 0
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; X64-SSE41-NEXT:    ret i1 [[TMP8]]
+;
+; X64-AVX1-LABEL: define i1 @length24_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 1
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = zext i64 [[TMP4]] to i128
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = xor i128 [[TMP5]], 3689065127958034230
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = or i128 [[TMP2]], [[TMP6]]
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = icmp ne i128 [[TMP7]], 0
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP8]]
+;
+; X64-AVX2-LABEL: define i1 @length24_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 1
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = zext i64 [[TMP4]] to i128
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = xor i128 [[TMP5]], 3689065127958034230
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = or i128 [[TMP2]], [[TMP6]]
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = icmp ne i128 [[TMP7]], 0
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP8]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length24_eq_const(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = zext i64 [[TMP4]] to i128
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = xor i128 [[TMP5]], 3689065127958034230
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = or i128 [[TMP2]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = icmp ne i128 [[TMP7]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; X64-AVX512BW-256-NEXT:    ret i1 [[TMP8]]
+;
+; X64-AVX512BW-LABEL: define i1 @length24_eq_const(
+; X64-AVX512BW-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = zext i64 [[TMP4]] to i128
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = xor i128 [[TMP5]], 3689065127958034230
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = or i128 [[TMP2]], [[TMP6]]
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = icmp ne i128 [[TMP7]], 0
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; X64-AVX512BW-NEXT:    ret i1 [[TMP8]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length24_eq_const(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = zext i64 [[TMP4]] to i128
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = xor i128 [[TMP5]], 3689065127958034230
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = or i128 [[TMP2]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = icmp ne i128 [[TMP7]], 0
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; X64-AVX512F-256-NEXT:    ret i1 [[TMP8]]
+;
+; X64-AVX512F-LABEL: define i1 @length24_eq_const(
+; X64-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 1
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = zext i64 [[TMP4]] to i128
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = xor i128 [[TMP5]], 3689065127958034230
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = or i128 [[TMP2]], [[TMP6]]
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = icmp ne i128 [[TMP7]], 0
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; X64-AVX512F-NEXT:    ret i1 [[TMP8]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length24_eq_const(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = zext i64 [[TMP4]] to i128
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = xor i128 [[TMP5]], 3689065127958034230
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = or i128 [[TMP2]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = icmp ne i128 [[TMP7]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; X64-MIC-AVX2-NEXT:    ret i1 [[TMP8]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length24_eq_const(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = zext i64 [[TMP4]] to i128
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = xor i128 [[TMP5]], 3689065127958034230
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = or i128 [[TMP2]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = icmp ne i128 [[TMP7]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP8]]
+; NOTE(review): the X64-AVX and X64-MIC-AVX blocks below match x86 assembly rather than IR - presumably left over from the llc-based version of this test; confirm that a RUN line still uses these prefixes, otherwise drop them.
+; X64-AVX-LABEL: length24_eq_const:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; X64-AVX-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; X64-AVX-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vptest %xmm0, %xmm0
+; X64-AVX-NEXT:    setne %al
+; X64-AVX-NEXT:    retq
+; X64-MIC-AVX-LABEL: length24_eq_const:
+; X64-MIC-AVX:       # %bb.0:
+; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-MIC-AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; X64-MIC-AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [959985462,858927408,0,0]
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm2, %zmm1, %k0
+; X64-MIC-AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [858927408,926299444,825243960,892613426]
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm1, %zmm0, %k1
+; X64-MIC-AVX-NEXT:    kortestw %k0, %k1
+; X64-MIC-AVX-NEXT:    setne %al
+; X64-MIC-AVX-NEXT:    vzeroupper
+; X64-MIC-AVX-NEXT:    retq
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 24) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length31(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length31(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64:       loadbb2:
+; X64-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64:       loadbb3:
+; X64-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-SSE41-LABEL: define i32 @length31(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X64-SSE41:       res_block:
+; X64-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-SSE41:       loadbb:
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-SSE41-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-SSE41:       loadbb1:
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-SSE41-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-SSE41-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-SSE41:       loadbb2:
+; X64-SSE41-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-SSE41-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-SSE41-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-SSE41-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-SSE41-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-SSE41-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-SSE41-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-SSE41-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-SSE41:       loadbb3:
+; X64-SSE41-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-SSE41-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-SSE41-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-SSE41-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-SSE41-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-SSE41-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-SSE41-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-SSE41-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-SSE41:       endblock:
+; X64-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-SSE41-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX1-LABEL: define i32 @length31(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX1-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX1-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX1:       loadbb2:
+; X64-AVX1-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX1-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX1-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX1-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX1-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX1-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX1-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX1-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX1:       loadbb3:
+; X64-AVX1-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-AVX1-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-AVX1-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX1-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX1-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX1-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX1-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX1-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX2-LABEL: define i32 @length31(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX2:       loadbb2:
+; X64-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX2-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX2-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX2-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX2-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX2-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX2-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX2-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX2:       loadbb3:
+; X64-AVX2-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-AVX2-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-AVX2-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX2-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX2-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX2-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX2-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX2-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length31(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW-256:       res_block:
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb:
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb1:
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       loadbb2:
+; X64-AVX512BW-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512BW-256-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512BW-256-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       loadbb3:
+; X64-AVX512BW-256-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-AVX512BW-256-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-AVX512BW-256-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512BW-256-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512BW-256-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       endblock:
+; X64-AVX512BW-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-256-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512BW-LABEL: define i32 @length31(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW:       res_block:
+; X64-AVX512BW-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512BW-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW:       loadbb:
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW:       loadbb1:
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512BW:       loadbb2:
+; X64-AVX512BW-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512BW-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512BW-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512BW-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512BW-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512BW:       loadbb3:
+; X64-AVX512BW-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-AVX512BW-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-AVX512BW-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512BW-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512BW-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512BW-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512BW-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW:       endblock:
+; X64-AVX512BW-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length31(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F-256:       res_block:
+; X64-AVX512F-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512F-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F-256:       loadbb:
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F-256:       loadbb1:
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       loadbb2:
+; X64-AVX512F-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512F-256-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512F-256-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       loadbb3:
+; X64-AVX512F-256-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-AVX512F-256-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-AVX512F-256-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512F-256-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512F-256-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       endblock:
+; X64-AVX512F-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-256-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512F-LABEL: define i32 @length31(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F:       res_block:
+; X64-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F:       loadbb:
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F:       loadbb1:
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512F:       loadbb2:
+; X64-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512F-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512F-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512F-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512F-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512F-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512F:       loadbb3:
+; X64-AVX512F-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-AVX512F-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-AVX512F-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512F-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512F-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512F-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512F-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512F-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F:       endblock:
+; X64-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length31(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX2:       res_block:
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb:
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb1:
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       loadbb2:
+; X64-MIC-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-MIC-AVX2-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-MIC-AVX2-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       loadbb3:
+; X64-MIC-AVX2-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-MIC-AVX2-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-MIC-AVX2-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-MIC-AVX2-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-MIC-AVX2-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       endblock:
+; X64-MIC-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length31(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX512F:       res_block:
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb:
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb1:
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       loadbb2:
+; X64-MIC-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-MIC-AVX512F-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-MIC-AVX512F-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       loadbb3:
+; X64-MIC-AVX512F-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-MIC-AVX512F-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-MIC-AVX512F-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-MIC-AVX512F-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-MIC-AVX512F-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       endblock:
+; X64-MIC-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[PHI_RES]]
+;
+
+
+
+; X64-SSE2:       res_block:
+
+
+
+
+
+; X64-SSE2:       loadbb:
+
+
+
+
+
+
+; X64-SSE2:       loadbb1:
+
+
+
+
+
+
+
+
+; X64-SSE2:       loadbb2:
+
+
+
+
+
+
+
+
+; X64-SSE2:       loadbb3:
+
+
+
+
+
+
+
+
+; X64-SSE2:       endblock:
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 31) nounwind
+  ret i32 %m
+}
+
+define i1 @length31_eq(ptr %x, ptr %y) nounwind {
+; Equality-only memcmp of 31 bytes: the IR checks below expect two overlapping i128 loads per pointer (offsets 0 and 15) whose XORs are OR'd and compared with zero. NOTE(review): the X64-AVX and X64-MIC-AVX groups at the end match assembly rather than IR; confirm a run line still enables those prefixes.
+; X64-LABEL: define i1 @length31_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X64-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length31_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length31_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length31_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length31_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length31_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length31_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length31_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length31_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length31_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX-LABEL: length31_eq:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-AVX-NEXT:    vmovdqu 15(%rdi), %xmm1
+; X64-AVX-NEXT:    vpxor 15(%rsi), %xmm1, %xmm1
+; X64-AVX-NEXT:    vpxor (%rsi), %xmm0, %xmm0
+; X64-AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vptest %xmm0, %xmm0
+; X64-AVX-NEXT:    sete %al
+; X64-AVX-NEXT:    retq
+; X64-MIC-AVX-LABEL: length31_eq:
+; X64-MIC-AVX:       # %bb.0:
+; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-MIC-AVX-NEXT:    vmovdqu 15(%rdi), %xmm1
+; X64-MIC-AVX-NEXT:    vmovdqu (%rsi), %xmm2
+; X64-MIC-AVX-NEXT:    vmovdqu 15(%rsi), %xmm3
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm3, %zmm1, %k0
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm2, %zmm0, %k1
+; X64-MIC-AVX-NEXT:    kortestw %k0, %k1
+; X64-MIC-AVX-NEXT:    sete %al
+; X64-MIC-AVX-NEXT:    vzeroupper
+; X64-MIC-AVX-NEXT:    retq
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 31) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length31_lt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length31_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64:       loadbb2:
+; X64-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64:       loadbb3:
+; X64-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length31_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X64-SSE41:       res_block:
+; X64-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-SSE41:       loadbb:
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-SSE41-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-SSE41:       loadbb1:
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-SSE41-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-SSE41-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-SSE41:       loadbb2:
+; X64-SSE41-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-SSE41-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-SSE41-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-SSE41-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-SSE41-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-SSE41-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-SSE41-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-SSE41-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-SSE41:       loadbb3:
+; X64-SSE41-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-SSE41-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-SSE41-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-SSE41-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-SSE41-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-SSE41-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-SSE41-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-SSE41-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-SSE41:       endblock:
+; X64-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length31_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX1-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX1-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX1:       loadbb2:
+; X64-AVX1-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX1-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX1-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX1-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX1-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX1-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX1-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX1-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX1:       loadbb3:
+; X64-AVX1-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-AVX1-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-AVX1-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX1-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX1-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX1-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX1-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX1-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length31_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX2:       loadbb2:
+; X64-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX2-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX2-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX2-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX2-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX2-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX2-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX2-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX2:       loadbb3:
+; X64-AVX2-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-AVX2-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-AVX2-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX2-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX2-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX2-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX2-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX2-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length31_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW-256:       res_block:
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb:
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb1:
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       loadbb2:
+; X64-AVX512BW-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512BW-256-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512BW-256-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       loadbb3:
+; X64-AVX512BW-256-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-AVX512BW-256-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-AVX512BW-256-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512BW-256-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512BW-256-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       endblock:
+; X64-AVX512BW-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length31_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW:       res_block:
+; X64-AVX512BW-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512BW-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW:       loadbb:
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW:       loadbb1:
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512BW:       loadbb2:
+; X64-AVX512BW-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512BW-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512BW-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512BW-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512BW-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512BW:       loadbb3:
+; X64-AVX512BW-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-AVX512BW-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-AVX512BW-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512BW-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512BW-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512BW-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512BW-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW:       endblock:
+; X64-AVX512BW-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length31_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F-256:       res_block:
+; X64-AVX512F-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512F-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F-256:       loadbb:
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F-256:       loadbb1:
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       loadbb2:
+; X64-AVX512F-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512F-256-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512F-256-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       loadbb3:
+; X64-AVX512F-256-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-AVX512F-256-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-AVX512F-256-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512F-256-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512F-256-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       endblock:
+; X64-AVX512F-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length31_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F:       res_block:
+; X64-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F:       loadbb:
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F:       loadbb1:
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512F:       loadbb2:
+; X64-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512F-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512F-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512F-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512F-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512F-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512F:       loadbb3:
+; X64-AVX512F-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-AVX512F-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-AVX512F-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512F-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512F-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512F-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512F-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512F-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F:       endblock:
+; X64-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length31_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX2:       res_block:
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb:
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb1:
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       loadbb2:
+; X64-MIC-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-MIC-AVX2-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-MIC-AVX2-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       loadbb3:
+; X64-MIC-AVX2-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-MIC-AVX2-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-MIC-AVX2-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-MIC-AVX2-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-MIC-AVX2-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       endblock:
+; X64-MIC-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length31_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX512F:       res_block:
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb:
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb1:
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       loadbb2:
+; X64-MIC-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-MIC-AVX512F-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-MIC-AVX512F-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       loadbb3:
+; X64-MIC-AVX512F-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-MIC-AVX512F-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-MIC-AVX512F-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-MIC-AVX512F-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-MIC-AVX512F-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       endblock:
+; X64-MIC-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+; X64-SSE2:       res_block:
+
+
+
+
+
+; X64-SSE2:       loadbb:
+
+
+
+
+
+
+; X64-SSE2:       loadbb1:
+
+
+
+
+
+
+
+
+; X64-SSE2:       loadbb2:
+
+
+
+
+
+
+
+
+; X64-SSE2:       loadbb3:
+
+
+
+
+
+
+
+
+; X64-SSE2:       endblock:
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 31) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length31_gt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length31_gt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64:       loadbb2:
+; X64-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64:       loadbb3:
+; X64-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length31_gt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X64-SSE41:       res_block:
+; X64-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-SSE41:       loadbb:
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-SSE41-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-SSE41:       loadbb1:
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-SSE41-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-SSE41-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-SSE41:       loadbb2:
+; X64-SSE41-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-SSE41-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-SSE41-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-SSE41-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-SSE41-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-SSE41-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-SSE41-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-SSE41-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-SSE41:       loadbb3:
+; X64-SSE41-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-SSE41-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-SSE41-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-SSE41-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-SSE41-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-SSE41-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-SSE41-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-SSE41-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-SSE41:       endblock:
+; X64-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length31_gt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX1-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX1-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX1:       loadbb2:
+; X64-AVX1-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX1-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX1-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX1-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX1-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX1-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX1-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX1-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX1:       loadbb3:
+; X64-AVX1-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-AVX1-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-AVX1-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX1-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX1-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX1-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX1-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX1-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length31_gt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX2:       loadbb2:
+; X64-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX2-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX2-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX2-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX2-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX2-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX2-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX2-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX2:       loadbb3:
+; X64-AVX2-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-AVX2-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-AVX2-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX2-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX2-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX2-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX2-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX2-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length31_gt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW-256:       res_block:
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb:
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb1:
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       loadbb2:
+; X64-AVX512BW-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512BW-256-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512BW-256-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       loadbb3:
+; X64-AVX512BW-256-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-AVX512BW-256-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-AVX512BW-256-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512BW-256-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512BW-256-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       endblock:
+; X64-AVX512BW-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length31_gt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW:       res_block:
+; X64-AVX512BW-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512BW-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW:       loadbb:
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW:       loadbb1:
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512BW:       loadbb2:
+; X64-AVX512BW-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512BW-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512BW-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512BW-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512BW-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512BW:       loadbb3:
+; X64-AVX512BW-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-AVX512BW-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-AVX512BW-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512BW-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512BW-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512BW-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512BW-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW:       endblock:
+; X64-AVX512BW-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length31_gt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F-256:       res_block:
+; X64-AVX512F-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512F-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F-256:       loadbb:
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F-256:       loadbb1:
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       loadbb2:
+; X64-AVX512F-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512F-256-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512F-256-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       loadbb3:
+; X64-AVX512F-256-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-AVX512F-256-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-AVX512F-256-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512F-256-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512F-256-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       endblock:
+; X64-AVX512F-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length31_gt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F:       res_block:
+; X64-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F:       loadbb:
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F:       loadbb1:
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512F:       loadbb2:
+; X64-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512F-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512F-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512F-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512F-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512F-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512F:       loadbb3:
+; X64-AVX512F-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-AVX512F-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-AVX512F-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512F-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512F-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512F-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512F-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512F-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F:       endblock:
+; X64-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length31_gt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX2:       res_block:
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb:
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb1:
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       loadbb2:
+; X64-MIC-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-MIC-AVX2-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-MIC-AVX2-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       loadbb3:
+; X64-MIC-AVX2-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-MIC-AVX2-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-MIC-AVX2-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-MIC-AVX2-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-MIC-AVX2-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       endblock:
+; X64-MIC-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length31_gt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX512F:       res_block:
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb:
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb1:
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       loadbb2:
+; X64-MIC-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-MIC-AVX512F-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-MIC-AVX512F-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       loadbb3:
+; X64-MIC-AVX512F-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 23
+; X64-MIC-AVX512F-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 23
+; X64-MIC-AVX512F-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-MIC-AVX512F-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-MIC-AVX512F-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       endblock:
+; X64-MIC-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+; X64-SSE2:       res_block:
+
+
+
+
+
+; X64-SSE2:       loadbb:
+
+
+
+
+
+
+; X64-SSE2:       loadbb1:
+
+
+
+
+
+
+
+
+; X64-SSE2:       loadbb2:
+
+
+
+
+
+
+
+
+; X64-SSE2:       loadbb3:
+
+
+
+
+
+
+
+
+; X64-SSE2:       endblock:
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 31) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length31_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"="128" {
+;
+; X64-LABEL: define i1 @length31_eq_prefer128(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1:[0-9]+]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X64-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length31_eq_prefer128(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2:[0-9]+]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length31_eq_prefer128(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2:[0-9]+]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length31_eq_prefer128(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2:[0-9]+]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length31_eq_prefer128(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2:[0-9]+]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length31_eq_prefer128(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2:[0-9]+]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length31_eq_prefer128(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2:[0-9]+]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length31_eq_prefer128(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2:[0-9]+]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length31_eq_prefer128(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2:[0-9]+]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length31_eq_prefer128(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2:[0-9]+]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX-LABEL: length31_eq_prefer128:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-AVX-NEXT:    vmovdqu 15(%rdi), %xmm1
+; X64-AVX-NEXT:    vpxor 15(%rsi), %xmm1, %xmm1
+; X64-AVX-NEXT:    vpxor (%rsi), %xmm0, %xmm0
+; X64-AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vptest %xmm0, %xmm0
+; X64-AVX-NEXT:    sete %al
+; X64-AVX-NEXT:    retq
+; X64-MIC-AVX-LABEL: length31_eq_prefer128:
+; X64-MIC-AVX:       # %bb.0:
+; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-MIC-AVX-NEXT:    vmovdqu 15(%rdi), %xmm1
+; X64-MIC-AVX-NEXT:    vmovdqu (%rsi), %xmm2
+; X64-MIC-AVX-NEXT:    vmovdqu 15(%rsi), %xmm3
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm3, %zmm1, %k0
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm2, %zmm0, %k1
+; X64-MIC-AVX-NEXT:    kortestw %k0, %k1
+; X64-MIC-AVX-NEXT:    sete %al
+; X64-MIC-AVX-NEXT:    vzeroupper
+; X64-MIC-AVX-NEXT:    retq
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 31) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+; Test: memcmp of %X against the global @.str over 31 bytes, with the i32
+; result reduced to an i1 "buffers differ" flag (icmp ne against 0).
+; The IR check blocks below expect the call to be expanded into two i128 loads
+; from X at byte offsets 0 and 15 (the second load ends at byte 30, so the two
+; loads overlap by one byte), each xor'ed with an i128 immediate, or'ed
+; together and compared against zero. The resulting i1 is returned directly;
+; the zext to i32 is matched but is not the value used by the ret.
+; The two immediates are presumably the constant-folded bytes of @.str, whose
+; definition is outside this function - TODO confirm.
+; NOTE(review): the X64-AVX and X64-MIC-AVX blocks at the end match assembly
+; (vmovdqu/vpxor/vptest, vpcmpneqd/kortestw) rather than IR; they look like
+; leftovers from an llc-based version of this test - confirm whether any RUN
+; line still enables those prefixes.
+define i1 @length31_eq_const(ptr %X) nounwind {
+;
+; X64-LABEL: define i1 @length31_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 64100044907875699958541276911416849973
+; X64-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X64-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; X64-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-NEXT:    ret i1 [[TMP7]]
+;
+; X64-SSE41-LABEL: define i1 @length31_eq_const(
+; X64-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 64100044907875699958541276911416849973
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-SSE41-NEXT:    ret i1 [[TMP7]]
+;
+; X64-AVX1-LABEL: define i1 @length31_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 64100044907875699958541276911416849973
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP7]]
+;
+; X64-AVX2-LABEL: define i1 @length31_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 64100044907875699958541276911416849973
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP7]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length31_eq_const(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 64100044907875699958541276911416849973
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-AVX512BW-256-NEXT:    ret i1 [[TMP7]]
+;
+; X64-AVX512BW-LABEL: define i1 @length31_eq_const(
+; X64-AVX512BW-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 64100044907875699958541276911416849973
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-AVX512BW-NEXT:    ret i1 [[TMP7]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length31_eq_const(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 64100044907875699958541276911416849973
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-AVX512F-256-NEXT:    ret i1 [[TMP7]]
+;
+; X64-AVX512F-LABEL: define i1 @length31_eq_const(
+; X64-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 64100044907875699958541276911416849973
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-AVX512F-NEXT:    ret i1 [[TMP7]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length31_eq_const(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 64100044907875699958541276911416849973
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-MIC-AVX2-NEXT:    ret i1 [[TMP7]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length31_eq_const(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 64100044907875699958541276911416849973
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP7]]
+;
+; X64-AVX-LABEL: length31_eq_const:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-AVX-NEXT:    vmovdqu 15(%rdi), %xmm1
+; X64-AVX-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; X64-AVX-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vptest %xmm0, %xmm0
+; X64-AVX-NEXT:    setne %al
+; X64-AVX-NEXT:    retq
+; X64-MIC-AVX-LABEL: length31_eq_const:
+; X64-MIC-AVX:       # %bb.0:
+; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-MIC-AVX-NEXT:    vmovdqu 15(%rdi), %xmm1
+; X64-MIC-AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [943142453,842084409,909456435,809056311]
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm2, %zmm1, %k0
+; X64-MIC-AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [858927408,926299444,825243960,892613426]
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm1, %zmm0, %k1
+; X64-MIC-AVX-NEXT:    kortestw %k0, %k1
+; X64-MIC-AVX-NEXT:    setne %al
+; X64-MIC-AVX-NEXT:    vzeroupper
+; X64-MIC-AVX-NEXT:    retq
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 31) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length32(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length32(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64:       loadbb2:
+; X64-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64:       loadbb3:
+; X64-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-SSE41-LABEL: define i32 @length32(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X64-SSE41:       res_block:
+; X64-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-SSE41:       loadbb:
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-SSE41-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-SSE41:       loadbb1:
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-SSE41-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-SSE41-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-SSE41:       loadbb2:
+; X64-SSE41-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-SSE41-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-SSE41-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-SSE41-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-SSE41-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-SSE41-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-SSE41-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-SSE41-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-SSE41:       loadbb3:
+; X64-SSE41-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-SSE41-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-SSE41-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-SSE41-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-SSE41-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-SSE41-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-SSE41-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-SSE41-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-SSE41:       endblock:
+; X64-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-SSE41-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX1-LABEL: define i32 @length32(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX1-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX1-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX1:       loadbb2:
+; X64-AVX1-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX1-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX1-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX1-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX1-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX1-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX1-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX1-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX1:       loadbb3:
+; X64-AVX1-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-AVX1-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-AVX1-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX1-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX1-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX1-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX1-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX1-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX2-LABEL: define i32 @length32(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX2:       loadbb2:
+; X64-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX2-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX2-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX2-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX2-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX2-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX2-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX2-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX2:       loadbb3:
+; X64-AVX2-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-AVX2-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-AVX2-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX2-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX2-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX2-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX2-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX2-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length32(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW-256:       res_block:
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb:
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb1:
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       loadbb2:
+; X64-AVX512BW-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512BW-256-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512BW-256-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       loadbb3:
+; X64-AVX512BW-256-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-AVX512BW-256-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-AVX512BW-256-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512BW-256-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512BW-256-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       endblock:
+; X64-AVX512BW-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-256-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512BW-LABEL: define i32 @length32(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW:       res_block:
+; X64-AVX512BW-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512BW-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW:       loadbb:
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW:       loadbb1:
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512BW:       loadbb2:
+; X64-AVX512BW-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512BW-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512BW-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512BW-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512BW-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512BW:       loadbb3:
+; X64-AVX512BW-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-AVX512BW-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-AVX512BW-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512BW-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512BW-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512BW-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512BW-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW:       endblock:
+; X64-AVX512BW-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length32(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F-256:       res_block:
+; X64-AVX512F-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512F-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F-256:       loadbb:
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F-256:       loadbb1:
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       loadbb2:
+; X64-AVX512F-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512F-256-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512F-256-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       loadbb3:
+; X64-AVX512F-256-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-AVX512F-256-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-AVX512F-256-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512F-256-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512F-256-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       endblock:
+; X64-AVX512F-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-256-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX512F-LABEL: define i32 @length32(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F:       res_block:
+; X64-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F:       loadbb:
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F:       loadbb1:
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512F:       loadbb2:
+; X64-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512F-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512F-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512F-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512F-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512F-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512F:       loadbb3:
+; X64-AVX512F-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-AVX512F-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-AVX512F-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512F-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512F-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512F-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512F-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512F-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F:       endblock:
+; X64-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length32(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX2:       res_block:
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb:
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb1:
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       loadbb2:
+; X64-MIC-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-MIC-AVX2-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-MIC-AVX2-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       loadbb3:
+; X64-MIC-AVX2-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-MIC-AVX2-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-MIC-AVX2-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-MIC-AVX2-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-MIC-AVX2-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       endblock:
+; X64-MIC-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length32(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX512F:       res_block:
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb:
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb1:
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       loadbb2:
+; X64-MIC-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-MIC-AVX512F-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-MIC-AVX512F-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       loadbb3:
+; X64-MIC-AVX512F-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-MIC-AVX512F-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-MIC-AVX512F-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-MIC-AVX512F-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-MIC-AVX512F-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       endblock:
+; X64-MIC-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[PHI_RES]]
+;
+
+
+
+; X64-SSE2:       res_block:
+
+
+
+
+
+; X64-SSE2:       loadbb:
+
+
+
+
+
+
+; X64-SSE2:       loadbb1:
+
+
+
+
+
+
+
+
+; X64-SSE2:       loadbb2:
+
+
+
+
+
+
+
+
+; X64-SSE2:       loadbb3:
+
+
+
+
+
+
+
+
+; X64-SSE2:       endblock:
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 32) nounwind
+  ret i32 %m
+}
+
+; PR33325 - https://bugs.llvm.org/show_bug.cgi?id=33325
+
+define i1 @length32_eq(ptr %x, ptr %y) nounwind {
+; Calls memcmp on 32 bytes and only tests the result for equality with zero; the checks below expect two i128 loads merged with xor/or for the X64 and X64-SSE41 prefixes and a single i256 compare for the AVX prefixes.
+; X64-LABEL: define i1 @length32_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length32_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length32_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = icmp ne i256 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length32_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = icmp ne i256 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length32_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = icmp ne i256 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length32_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = icmp ne i256 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length32_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = icmp ne i256 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length32_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = icmp ne i256 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length32_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = icmp ne i256 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length32_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = icmp ne i256 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+; NOTE(review): the X64-AVX512 and X64-MIC-AVX checks below match x86 assembly rather than IR and look like leftovers from the earlier llc-based test; confirm no RUN line uses these prefixes before dropping them.
+; X64-AVX512-LABEL: length32_eq:
+; X64-AVX512:       # %bb.0:
+; X64-AVX512-NEXT:    vmovdqu (%rdi), %ymm0
+; X64-AVX512-NEXT:    vpxor (%rsi), %ymm0, %ymm0
+; X64-AVX512-NEXT:    vptest %ymm0, %ymm0
+; X64-AVX512-NEXT:    sete %al
+; X64-AVX512-NEXT:    vzeroupper
+; X64-AVX512-NEXT:    retq
+; X64-MIC-AVX-LABEL: length32_eq:
+; X64-MIC-AVX:       # %bb.0:
+; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %ymm0
+; X64-MIC-AVX-NEXT:    vmovdqu (%rsi), %ymm1
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0
+; X64-MIC-AVX-NEXT:    kortestw %k0, %k0
+; X64-MIC-AVX-NEXT:    sete %al
+; X64-MIC-AVX-NEXT:    vzeroupper
+; X64-MIC-AVX-NEXT:    retq
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 32) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length32_lt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length32_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64:       loadbb2:
+; X64-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64:       loadbb3:
+; X64-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length32_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X64-SSE41:       res_block:
+; X64-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-SSE41:       loadbb:
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-SSE41-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-SSE41:       loadbb1:
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-SSE41-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-SSE41-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-SSE41:       loadbb2:
+; X64-SSE41-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-SSE41-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-SSE41-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-SSE41-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-SSE41-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-SSE41-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-SSE41-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-SSE41-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-SSE41:       loadbb3:
+; X64-SSE41-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-SSE41-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-SSE41-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-SSE41-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-SSE41-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-SSE41-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-SSE41-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-SSE41-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-SSE41:       endblock:
+; X64-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length32_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX1-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX1-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX1:       loadbb2:
+; X64-AVX1-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX1-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX1-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX1-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX1-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX1-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX1-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX1-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX1:       loadbb3:
+; X64-AVX1-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-AVX1-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-AVX1-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX1-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX1-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX1-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX1-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX1-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length32_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX2:       loadbb2:
+; X64-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX2-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX2-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX2-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX2-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX2-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX2-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX2-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX2:       loadbb3:
+; X64-AVX2-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-AVX2-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-AVX2-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX2-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX2-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX2-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX2-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX2-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length32_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW-256:       res_block:
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb:
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb1:
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       loadbb2:
+; X64-AVX512BW-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512BW-256-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512BW-256-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       loadbb3:
+; X64-AVX512BW-256-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-AVX512BW-256-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-AVX512BW-256-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512BW-256-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512BW-256-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       endblock:
+; X64-AVX512BW-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length32_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW:       res_block:
+; X64-AVX512BW-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512BW-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW:       loadbb:
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW:       loadbb1:
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512BW:       loadbb2:
+; X64-AVX512BW-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512BW-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512BW-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512BW-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512BW-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512BW:       loadbb3:
+; X64-AVX512BW-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-AVX512BW-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-AVX512BW-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512BW-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512BW-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512BW-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512BW-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW:       endblock:
+; X64-AVX512BW-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length32_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F-256:       res_block:
+; X64-AVX512F-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512F-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F-256:       loadbb:
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F-256:       loadbb1:
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       loadbb2:
+; X64-AVX512F-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512F-256-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512F-256-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       loadbb3:
+; X64-AVX512F-256-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-AVX512F-256-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-AVX512F-256-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512F-256-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512F-256-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       endblock:
+; X64-AVX512F-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length32_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F:       res_block:
+; X64-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F:       loadbb:
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F:       loadbb1:
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512F:       loadbb2:
+; X64-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512F-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512F-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512F-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512F-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512F-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512F:       loadbb3:
+; X64-AVX512F-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-AVX512F-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-AVX512F-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512F-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512F-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512F-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512F-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512F-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F:       endblock:
+; X64-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length32_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX2:       res_block:
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb:
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb1:
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       loadbb2:
+; X64-MIC-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-MIC-AVX2-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-MIC-AVX2-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       loadbb3:
+; X64-MIC-AVX2-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-MIC-AVX2-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-MIC-AVX2-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-MIC-AVX2-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-MIC-AVX2-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       endblock:
+; X64-MIC-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length32_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX512F:       res_block:
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb:
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb1:
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       loadbb2:
+; X64-MIC-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-MIC-AVX512F-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-MIC-AVX512F-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       loadbb3:
+; X64-MIC-AVX512F-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-MIC-AVX512F-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-MIC-AVX512F-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-MIC-AVX512F-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-MIC-AVX512F-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       endblock:
+; X64-MIC-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+; X64-SSE2:       res_block:
+
+
+
+
+
+; X64-SSE2:       loadbb:
+
+
+
+
+
+
+; X64-SSE2:       loadbb1:
+
+
+
+
+
+
+
+
+; X64-SSE2:       loadbb2:
+
+
+
+
+
+
+
+
+; X64-SSE2:       loadbb3:
+
+
+
+
+
+
+
+
+; X64-SSE2:       endblock:
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 32) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length32_gt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length32_gt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64:       loadbb2:
+; X64-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64:       loadbb3:
+; X64-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length32_gt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X64-SSE41:       res_block:
+; X64-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-SSE41:       loadbb:
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-SSE41-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-SSE41:       loadbb1:
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-SSE41-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-SSE41-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-SSE41:       loadbb2:
+; X64-SSE41-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-SSE41-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-SSE41-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-SSE41-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-SSE41-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-SSE41-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-SSE41-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-SSE41-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-SSE41:       loadbb3:
+; X64-SSE41-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-SSE41-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-SSE41-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-SSE41-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-SSE41-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-SSE41-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-SSE41-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-SSE41-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-SSE41:       endblock:
+; X64-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length32_gt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX1-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX1-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX1:       loadbb2:
+; X64-AVX1-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX1-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX1-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX1-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX1-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX1-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX1-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX1-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX1:       loadbb3:
+; X64-AVX1-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-AVX1-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-AVX1-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX1-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX1-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX1-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX1-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX1-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length32_gt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX2:       loadbb2:
+; X64-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX2-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX2-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX2-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX2-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX2-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX2-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX2-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX2:       loadbb3:
+; X64-AVX2-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-AVX2-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-AVX2-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX2-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX2-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX2-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX2-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX2-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length32_gt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW-256:       res_block:
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512BW-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb:
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW-256:       loadbb1:
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       loadbb2:
+; X64-AVX512BW-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512BW-256-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512BW-256-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       loadbb3:
+; X64-AVX512BW-256-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-AVX512BW-256-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-AVX512BW-256-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512BW-256-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512BW-256-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512BW-256-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW-256:       endblock:
+; X64-AVX512BW-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length32_gt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512BW:       res_block:
+; X64-AVX512BW-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512BW-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512BW-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512BW:       loadbb:
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512BW-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512BW:       loadbb1:
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512BW-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512BW:       loadbb2:
+; X64-AVX512BW-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512BW-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512BW-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512BW-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512BW-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512BW:       loadbb3:
+; X64-AVX512BW-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-AVX512BW-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-AVX512BW-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512BW-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512BW-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512BW-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512BW-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512BW-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512BW:       endblock:
+; X64-AVX512BW-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length32_gt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F-256:       res_block:
+; X64-AVX512F-256-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512F-256-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-256-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F-256:       loadbb:
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-256-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F-256:       loadbb1:
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-256-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       loadbb2:
+; X64-AVX512F-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512F-256-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512F-256-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       loadbb3:
+; X64-AVX512F-256-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-AVX512F-256-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-AVX512F-256-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512F-256-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512F-256-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512F-256-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F-256:       endblock:
+; X64-AVX512F-256-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length32_gt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX512F:       res_block:
+; X64-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX512F:       loadbb:
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX512F:       loadbb1:
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX512F-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-AVX512F:       loadbb2:
+; X64-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-AVX512F-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-AVX512F-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-AVX512F-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-AVX512F-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-AVX512F-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-AVX512F:       loadbb3:
+; X64-AVX512F-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-AVX512F-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-AVX512F-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-AVX512F-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-AVX512F-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-AVX512F-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-AVX512F-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-AVX512F-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX512F:       endblock:
+; X64-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length32_gt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX2:       res_block:
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-MIC-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb:
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX2:       loadbb1:
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       loadbb2:
+; X64-MIC-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-MIC-AVX2-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-MIC-AVX2-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       loadbb3:
+; X64-MIC-AVX2-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-MIC-AVX2-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-MIC-AVX2-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-MIC-AVX2-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-MIC-AVX2-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-MIC-AVX2-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX2:       endblock:
+; X64-MIC-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length32_gt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    br label [[LOADBB:%.*]]
+; X64-MIC-AVX512F:       res_block:
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ], [ [[TMP19:%.*]], [[LOADBB2:%.*]] ], [ [[TMP26:%.*]], [[LOADBB3:%.*]] ]
+; X64-MIC-AVX512F-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ], [ [[TMP20:%.*]], [[LOADBB2]] ], [ [[TMP27:%.*]], [[LOADBB3]] ]
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-MIC-AVX512F-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb:
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-MIC-AVX512F-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-MIC-AVX512F:       loadbb1:
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-MIC-AVX512F-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP14]], label [[LOADBB2]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       loadbb2:
+; X64-MIC-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP15]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP18:%.*]] = load i64, ptr [[TMP16]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP19]] = call i64 @llvm.bswap.i64(i64 [[TMP17]])
+; X64-MIC-AVX512F-NEXT:    [[TMP20]] = call i64 @llvm.bswap.i64(i64 [[TMP18]])
+; X64-MIC-AVX512F-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP21]], label [[LOADBB3]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       loadbb3:
+; X64-MIC-AVX512F-NEXT:    [[TMP22:%.*]] = getelementptr i8, ptr [[X]], i64 24
+; X64-MIC-AVX512F-NEXT:    [[TMP23:%.*]] = getelementptr i8, ptr [[Y]], i64 24
+; X64-MIC-AVX512F-NEXT:    [[TMP24:%.*]] = load i64, ptr [[TMP22]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP25:%.*]] = load i64, ptr [[TMP23]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP26]] = call i64 @llvm.bswap.i64(i64 [[TMP24]])
+; X64-MIC-AVX512F-NEXT:    [[TMP27]] = call i64 @llvm.bswap.i64(i64 [[TMP25]])
+; X64-MIC-AVX512F-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[TMP26]], [[TMP27]]
+; X64-MIC-AVX512F-NEXT:    br i1 [[TMP28]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-MIC-AVX512F:       endblock:
+; X64-MIC-AVX512F-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB3]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[PHI_RES]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+; X64-SSE2:       res_block:
+
+
+
+
+
+; X64-SSE2:       loadbb:
+
+
+
+
+
+
+; X64-SSE2:       loadbb1:
+
+
+
+
+
+
+
+
+; X64-SSE2:       loadbb2:
+
+
+
+
+
+
+
+
+; X64-SSE2:       loadbb3:
+
+
+
+
+
+
+
+
+; X64-SSE2:       endblock:
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 32) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length32_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"="128" {
+;
+; X64-LABEL: define i1 @length32_eq_prefer128(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length32_eq_prefer128(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length32_eq_prefer128(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length32_eq_prefer128(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length32_eq_prefer128(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length32_eq_prefer128(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length32_eq_prefer128(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length32_eq_prefer128(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length32_eq_prefer128(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length32_eq_prefer128(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX-LABEL: length32_eq_prefer128:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-AVX-NEXT:    vmovdqu 16(%rdi), %xmm1
+; X64-AVX-NEXT:    vpxor 16(%rsi), %xmm1, %xmm1
+; X64-AVX-NEXT:    vpxor (%rsi), %xmm0, %xmm0
+; X64-AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vptest %xmm0, %xmm0
+; X64-AVX-NEXT:    sete %al
+; X64-AVX-NEXT:    retq
+; X64-MIC-AVX-LABEL: length32_eq_prefer128:
+; X64-MIC-AVX:       # %bb.0:
+; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-MIC-AVX-NEXT:    vmovdqu 16(%rdi), %xmm1
+; X64-MIC-AVX-NEXT:    vmovdqu (%rsi), %xmm2
+; X64-MIC-AVX-NEXT:    vmovdqu 16(%rsi), %xmm3
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm3, %zmm1, %k0
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm2, %zmm0, %k1
+; X64-MIC-AVX-NEXT:    kortestw %k0, %k1
+; X64-MIC-AVX-NEXT:    sete %al
+; X64-MIC-AVX-NEXT:    vzeroupper
+; X64-MIC-AVX-NEXT:    retq
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 32) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length32_eq_const(ptr %X) nounwind {
+;
+; X64-LABEL: define i1 @length32_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 65382562593882267225249597816672106294
+; X64-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X64-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; X64-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-NEXT:    ret i1 [[TMP7]]
+;
+; X64-SSE41-LABEL: define i1 @length32_eq_const(
+; X64-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 65382562593882267225249597816672106294
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-SSE41-NEXT:    ret i1 [[TMP7]]
+;
+; X64-AVX1-LABEL: define i1 @length32_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = icmp ne i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX2-LABEL: define i1 @length32_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = icmp ne i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length32_eq_const(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = icmp ne i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX512BW-256-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX512BW-LABEL: define i1 @length32_eq_const(
+; X64-AVX512BW-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = icmp ne i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX512BW-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length32_eq_const(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = icmp ne i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX512F-256-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX512F-LABEL: define i1 @length32_eq_const(
+; X64-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = icmp ne i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX512F-NEXT:    ret i1 [[TMP2]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length32_eq_const(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = icmp ne i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-MIC-AVX2-NEXT:    ret i1 [[TMP2]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length32_eq_const(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = icmp ne i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX512-LABEL: length32_eq_const:
+; X64-AVX512:       # %bb.0:
+; X64-AVX512-NEXT:    vmovdqu (%rdi), %ymm0
+; X64-AVX512-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; X64-AVX512-NEXT:    vptest %ymm0, %ymm0
+; X64-AVX512-NEXT:    setne %al
+; X64-AVX512-NEXT:    vzeroupper
+; X64-AVX512-NEXT:    retq
+; X64-MIC-AVX-LABEL: length32_eq_const:
+; X64-MIC-AVX:       # %bb.0:
+; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %ymm0
+; X64-MIC-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [858927408,926299444,825243960,892613426,959985462,858927408,926299444,825243960]
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0
+; X64-MIC-AVX-NEXT:    kortestw %k0, %k0
+; X64-MIC-AVX-NEXT:    setne %al
+; X64-MIC-AVX-NEXT:    vzeroupper
+; X64-MIC-AVX-NEXT:    retq
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 32) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length48(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length48(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR0]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-SSE41-LABEL: define i32 @length48(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5:[0-9]+]]
+; X64-SSE41-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @length48(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5:[0-9]+]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @length48(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5:[0-9]+]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length48(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5:[0-9]+]]
+; X64-AVX512BW-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-LABEL: define i32 @length48(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5:[0-9]+]]
+; X64-AVX512BW-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length48(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5:[0-9]+]]
+; X64-AVX512F-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-LABEL: define i32 @length48(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5:[0-9]+]]
+; X64-AVX512F-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length48(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5:[0-9]+]]
+; X64-MIC-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length48(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5:[0-9]+]]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[M]]
+;
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 48) nounwind
+  ret i32 %m
+}
+
+define i1 @length48_eq(ptr %x, ptr %y) nounwind {
+;
+; X64-LABEL: define i1 @length48_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-NEXT:    [[TMP11:%.*]] = load i128, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12:%.*]] = load i128, ptr [[TMP10]], align 1
+; X64-NEXT:    [[TMP13:%.*]] = xor i128 [[TMP11]], [[TMP12]]
+; X64-NEXT:    [[TMP14:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-NEXT:    [[TMP15:%.*]] = or i128 [[TMP14]], [[TMP13]]
+; X64-NEXT:    [[TMP16:%.*]] = icmp ne i128 [[TMP15]], 0
+; X64-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X64-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP17]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length48_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i128, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12:%.*]] = load i128, ptr [[TMP10]], align 1
+; X64-SSE41-NEXT:    [[TMP13:%.*]] = xor i128 [[TMP11]], [[TMP12]]
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-SSE41-NEXT:    [[TMP15:%.*]] = or i128 [[TMP14]], [[TMP13]]
+; X64-SSE41-NEXT:    [[TMP16:%.*]] = icmp ne i128 [[TMP15]], 0
+; X64-SSE41-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP17]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length48_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = zext i128 [[TMP6]] to i256
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = zext i128 [[TMP7]] to i256
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = xor i256 [[TMP8]], [[TMP9]]
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = or i256 [[TMP3]], [[TMP10]]
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = icmp ne i256 [[TMP11]], 0
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length48_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = zext i128 [[TMP6]] to i256
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = zext i128 [[TMP7]] to i256
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = xor i256 [[TMP8]], [[TMP9]]
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = or i256 [[TMP3]], [[TMP10]]
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = icmp ne i256 [[TMP11]], 0
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length48_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = zext i128 [[TMP6]] to i256
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = zext i128 [[TMP7]] to i256
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = xor i256 [[TMP8]], [[TMP9]]
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = or i256 [[TMP3]], [[TMP10]]
+; X64-AVX512BW-256-NEXT:    [[TMP12:%.*]] = icmp ne i256 [[TMP11]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length48_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = zext i128 [[TMP6]] to i256
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = zext i128 [[TMP7]] to i256
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = xor i256 [[TMP8]], [[TMP9]]
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = or i256 [[TMP3]], [[TMP10]]
+; X64-AVX512BW-NEXT:    [[TMP12:%.*]] = icmp ne i256 [[TMP11]], 0
+; X64-AVX512BW-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length48_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = zext i128 [[TMP6]] to i256
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = zext i128 [[TMP7]] to i256
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = xor i256 [[TMP8]], [[TMP9]]
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = or i256 [[TMP3]], [[TMP10]]
+; X64-AVX512F-256-NEXT:    [[TMP12:%.*]] = icmp ne i256 [[TMP11]], 0
+; X64-AVX512F-256-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length48_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = zext i128 [[TMP6]] to i256
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = zext i128 [[TMP7]] to i256
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = xor i256 [[TMP8]], [[TMP9]]
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = or i256 [[TMP3]], [[TMP10]]
+; X64-AVX512F-NEXT:    [[TMP12:%.*]] = icmp ne i256 [[TMP11]], 0
+; X64-AVX512F-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length48_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = zext i128 [[TMP6]] to i256
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = zext i128 [[TMP7]] to i256
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = xor i256 [[TMP8]], [[TMP9]]
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = or i256 [[TMP3]], [[TMP10]]
+; X64-MIC-AVX2-NEXT:    [[TMP12:%.*]] = icmp ne i256 [[TMP11]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length48_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = zext i128 [[TMP6]] to i256
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = zext i128 [[TMP7]] to i256
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = xor i256 [[TMP8]], [[TMP9]]
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = or i256 [[TMP3]], [[TMP10]]
+; X64-MIC-AVX512F-NEXT:    [[TMP12:%.*]] = icmp ne i256 [[TMP11]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512-LABEL: length48_eq:
+; X64-AVX512:       # %bb.0:
+; X64-AVX512-NEXT:    vmovdqu (%rdi), %ymm0
+; X64-AVX512-NEXT:    vmovdqu 32(%rdi), %xmm1
+; X64-AVX512-NEXT:    vmovdqu 32(%rsi), %xmm2
+; X64-AVX512-NEXT:    vpxor (%rsi), %ymm0, %ymm0
+; X64-AVX512-NEXT:    vpxor %ymm2, %ymm1, %ymm1
+; X64-AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT:    vptest %ymm0, %ymm0
+; X64-AVX512-NEXT:    sete %al
+; X64-AVX512-NEXT:    vzeroupper
+; X64-AVX512-NEXT:    retq
+; X64-MIC-AVX-LABEL: length48_eq:
+; X64-MIC-AVX:       # %bb.0:
+; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %ymm0
+; X64-MIC-AVX-NEXT:    vmovdqu (%rsi), %ymm1
+; X64-MIC-AVX-NEXT:    vmovdqu 32(%rdi), %xmm2
+; X64-MIC-AVX-NEXT:    vmovdqu 32(%rsi), %xmm3
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm3, %zmm2, %k0
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm1, %zmm0, %k1
+; X64-MIC-AVX-NEXT:    kortestw %k0, %k1
+; X64-MIC-AVX-NEXT:    sete %al
+; X64-MIC-AVX-NEXT:    vzeroupper
+; X64-MIC-AVX-NEXT:    retq
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 48) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length48_lt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length48_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length48_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length48_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length48_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length48_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length48_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length48_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length48_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length48_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length48_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 48) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length48_gt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length48_gt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length48_gt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length48_gt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length48_gt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length48_gt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length48_gt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length48_gt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length48_gt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length48_gt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length48_gt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 48) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 48) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length48_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"="128" {
+; 48-byte equality-only memcmp with "prefer-vector-width"="128" - every IR check group below expects three i128 load/xor pairs (offsets 0, 16, 32), OR-reduced and compared against zero.
+; X64-LABEL: define i1 @length48_eq_prefer128(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-NEXT:    [[TMP11:%.*]] = load i128, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12:%.*]] = load i128, ptr [[TMP10]], align 1
+; X64-NEXT:    [[TMP13:%.*]] = xor i128 [[TMP11]], [[TMP12]]
+; X64-NEXT:    [[TMP14:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-NEXT:    [[TMP15:%.*]] = or i128 [[TMP14]], [[TMP13]]
+; X64-NEXT:    [[TMP16:%.*]] = icmp ne i128 [[TMP15]], 0
+; X64-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X64-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP17]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length48_eq_prefer128(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i128, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12:%.*]] = load i128, ptr [[TMP10]], align 1
+; X64-SSE41-NEXT:    [[TMP13:%.*]] = xor i128 [[TMP11]], [[TMP12]]
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-SSE41-NEXT:    [[TMP15:%.*]] = or i128 [[TMP14]], [[TMP13]]
+; X64-SSE41-NEXT:    [[TMP16:%.*]] = icmp ne i128 [[TMP15]], 0
+; X64-SSE41-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP17]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length48_eq_prefer128(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i128, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = load i128, ptr [[TMP10]], align 1
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = xor i128 [[TMP11]], [[TMP12]]
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX1-NEXT:    [[TMP15:%.*]] = or i128 [[TMP14]], [[TMP13]]
+; X64-AVX1-NEXT:    [[TMP16:%.*]] = icmp ne i128 [[TMP15]], 0
+; X64-AVX1-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP17]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length48_eq_prefer128(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i128, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = load i128, ptr [[TMP10]], align 1
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = xor i128 [[TMP11]], [[TMP12]]
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX2-NEXT:    [[TMP15:%.*]] = or i128 [[TMP14]], [[TMP13]]
+; X64-AVX2-NEXT:    [[TMP16:%.*]] = icmp ne i128 [[TMP15]], 0
+; X64-AVX2-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP17]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length48_eq_prefer128(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i128, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12:%.*]] = load i128, ptr [[TMP10]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP13:%.*]] = xor i128 [[TMP11]], [[TMP12]]
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-256-NEXT:    [[TMP15:%.*]] = or i128 [[TMP14]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    [[TMP16:%.*]] = icmp ne i128 [[TMP15]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP17]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length48_eq_prefer128(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i128, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12:%.*]] = load i128, ptr [[TMP10]], align 1
+; X64-AVX512BW-NEXT:    [[TMP13:%.*]] = xor i128 [[TMP11]], [[TMP12]]
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-NEXT:    [[TMP15:%.*]] = or i128 [[TMP14]], [[TMP13]]
+; X64-AVX512BW-NEXT:    [[TMP16:%.*]] = icmp ne i128 [[TMP15]], 0
+; X64-AVX512BW-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP17]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length48_eq_prefer128(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i128, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12:%.*]] = load i128, ptr [[TMP10]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP13:%.*]] = xor i128 [[TMP11]], [[TMP12]]
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX512F-256-NEXT:    [[TMP15:%.*]] = or i128 [[TMP14]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    [[TMP16:%.*]] = icmp ne i128 [[TMP15]], 0
+; X64-AVX512F-256-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP17]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length48_eq_prefer128(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i128, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12:%.*]] = load i128, ptr [[TMP10]], align 1
+; X64-AVX512F-NEXT:    [[TMP13:%.*]] = xor i128 [[TMP11]], [[TMP12]]
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-AVX512F-NEXT:    [[TMP15:%.*]] = or i128 [[TMP14]], [[TMP13]]
+; X64-AVX512F-NEXT:    [[TMP16:%.*]] = icmp ne i128 [[TMP15]], 0
+; X64-AVX512F-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP17]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length48_eq_prefer128(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i128, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12:%.*]] = load i128, ptr [[TMP10]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP13:%.*]] = xor i128 [[TMP11]], [[TMP12]]
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX2-NEXT:    [[TMP15:%.*]] = or i128 [[TMP14]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    [[TMP16:%.*]] = icmp ne i128 [[TMP15]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP17]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length48_eq_prefer128(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i128, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12:%.*]] = load i128, ptr [[TMP10]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP13:%.*]] = xor i128 [[TMP11]], [[TMP12]]
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX512F-NEXT:    [[TMP15:%.*]] = or i128 [[TMP14]], [[TMP13]]
+; X64-MIC-AVX512F-NEXT:    [[TMP16:%.*]] = icmp ne i128 [[TMP15]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP17]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+; NOTE(review) - the X64-AVX and X64-MIC-AVX groups below hold assembly-style checks while every other group matches IR; presumably stale leftovers from the llc-based test. Confirm against the RUN lines and drop them if unused.
+; X64-AVX-LABEL: length48_eq_prefer128:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-AVX-NEXT:    vmovdqu 16(%rdi), %xmm1
+; X64-AVX-NEXT:    vmovdqu 32(%rdi), %xmm2
+; X64-AVX-NEXT:    vpxor 16(%rsi), %xmm1, %xmm1
+; X64-AVX-NEXT:    vpxor (%rsi), %xmm0, %xmm0
+; X64-AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vpxor 32(%rsi), %xmm2, %xmm1
+; X64-AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vptest %xmm0, %xmm0
+; X64-AVX-NEXT:    sete %al
+; X64-AVX-NEXT:    retq
+; X64-MIC-AVX-LABEL: length48_eq_prefer128:
+; X64-MIC-AVX:       # %bb.0:
+; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-MIC-AVX-NEXT:    vmovdqu 16(%rdi), %xmm1
+; X64-MIC-AVX-NEXT:    vmovdqu 32(%rdi), %xmm2
+; X64-MIC-AVX-NEXT:    vmovdqu (%rsi), %xmm3
+; X64-MIC-AVX-NEXT:    vmovdqu 16(%rsi), %xmm4
+; X64-MIC-AVX-NEXT:    vmovdqu 32(%rsi), %xmm5
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm4, %zmm1, %k0
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm3, %zmm0, %k1
+; X64-MIC-AVX-NEXT:    korw %k0, %k1, %k0
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm5, %zmm2, %k1
+; X64-MIC-AVX-NEXT:    kortestw %k1, %k0
+; X64-MIC-AVX-NEXT:    sete %al
+; X64-MIC-AVX-NEXT:    vzeroupper
+; X64-MIC-AVX-NEXT:    retq
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 48) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length48_eq_const(ptr %X) nounwind {
+; 48-byte memcmp against the constant @.str, used only as a "!= 0" test - the plain-X64 and SSE4.1 groups expect three i128 load/xor-with-constant steps, the remaining IR groups an i256 step plus a zero-extended i128 tail.
+; X64-LABEL: define i1 @length48_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 65382562593882267225249597816672106294
+; X64-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP6]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP7]], 73389002901949112059321871464991568690
+; X64-NEXT:    [[TMP9:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X64-NEXT:    [[TMP10:%.*]] = or i128 [[TMP9]], [[TMP8]]
+; X64-NEXT:    [[TMP11:%.*]] = icmp ne i128 [[TMP10]], 0
+; X64-NEXT:    [[TMP12:%.*]] = zext i1 [[TMP11]] to i32
+; X64-NEXT:    ret i1 [[TMP11]]
+;
+; X64-SSE41-LABEL: define i1 @length48_eq_const(
+; X64-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 65382562593882267225249597816672106294
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP6]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP7]], 73389002901949112059321871464991568690
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = or i128 [[TMP9]], [[TMP8]]
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = icmp ne i128 [[TMP10]], 0
+; X64-SSE41-NEXT:    [[TMP12:%.*]] = zext i1 [[TMP11]] to i32
+; X64-SSE41-NEXT:    ret i1 [[TMP11]]
+;
+; X64-AVX1-LABEL: define i1 @length48_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = zext i128 [[TMP4]] to i256
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = xor i256 [[TMP5]], 73389002901949112059321871464991568690
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = or i256 [[TMP2]], [[TMP6]]
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = icmp ne i256 [[TMP7]], 0
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP8]]
+;
+; X64-AVX2-LABEL: define i1 @length48_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = zext i128 [[TMP4]] to i256
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = xor i256 [[TMP5]], 73389002901949112059321871464991568690
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = or i256 [[TMP2]], [[TMP6]]
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = icmp ne i256 [[TMP7]], 0
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP8]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length48_eq_const(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = zext i128 [[TMP4]] to i256
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = xor i256 [[TMP5]], 73389002901949112059321871464991568690
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = or i256 [[TMP2]], [[TMP6]]
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = icmp ne i256 [[TMP7]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; X64-AVX512BW-256-NEXT:    ret i1 [[TMP8]]
+;
+; X64-AVX512BW-LABEL: define i1 @length48_eq_const(
+; X64-AVX512BW-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = zext i128 [[TMP4]] to i256
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = xor i256 [[TMP5]], 73389002901949112059321871464991568690
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = or i256 [[TMP2]], [[TMP6]]
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = icmp ne i256 [[TMP7]], 0
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; X64-AVX512BW-NEXT:    ret i1 [[TMP8]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length48_eq_const(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = zext i128 [[TMP4]] to i256
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = xor i256 [[TMP5]], 73389002901949112059321871464991568690
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = or i256 [[TMP2]], [[TMP6]]
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = icmp ne i256 [[TMP7]], 0
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; X64-AVX512F-256-NEXT:    ret i1 [[TMP8]]
+;
+; X64-AVX512F-LABEL: define i1 @length48_eq_const(
+; X64-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = zext i128 [[TMP4]] to i256
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = xor i256 [[TMP5]], 73389002901949112059321871464991568690
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = or i256 [[TMP2]], [[TMP6]]
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = icmp ne i256 [[TMP7]], 0
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; X64-AVX512F-NEXT:    ret i1 [[TMP8]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length48_eq_const(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = zext i128 [[TMP4]] to i256
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = xor i256 [[TMP5]], 73389002901949112059321871464991568690
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = or i256 [[TMP2]], [[TMP6]]
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = icmp ne i256 [[TMP7]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; X64-MIC-AVX2-NEXT:    ret i1 [[TMP8]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length48_eq_const(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = zext i128 [[TMP4]] to i256
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = xor i256 [[TMP5]], 73389002901949112059321871464991568690
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = or i256 [[TMP2]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = icmp ne i256 [[TMP7]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP8]]
+; NOTE(review) - the X64-AVX512 and X64-MIC-AVX groups below hold assembly-style checks while every other group matches IR; presumably stale leftovers from the llc-based test. Confirm against the RUN lines and drop them if unused.
+; X64-AVX512-LABEL: length48_eq_const:
+; X64-AVX512:       # %bb.0:
+; X64-AVX512-NEXT:    vmovdqu (%rdi), %ymm0
+; X64-AVX512-NEXT:    vmovdqu 32(%rdi), %xmm1
+; X64-AVX512-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; X64-AVX512-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; X64-AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT:    vptest %ymm0, %ymm0
+; X64-AVX512-NEXT:    setne %al
+; X64-AVX512-NEXT:    vzeroupper
+; X64-AVX512-NEXT:    retq
+; X64-MIC-AVX-LABEL: length48_eq_const:
+; X64-MIC-AVX:       # %bb.0:
+; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %ymm0
+; X64-MIC-AVX-NEXT:    vmovdqu 32(%rdi), %xmm1
+; X64-MIC-AVX-NEXT:    vmovdqa {{.*#+}} ymm2 = [892613426,959985462,858927408,926299444,0,0,0,0]
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm2, %zmm1, %k0
+; X64-MIC-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [858927408,926299444,825243960,892613426,959985462,858927408,926299444,825243960]
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm1, %zmm0, %k1
+; X64-MIC-AVX-NEXT:    kortestw %k0, %k1
+; X64-MIC-AVX-NEXT:    setne %al
+; X64-MIC-AVX-NEXT:    vzeroupper
+; X64-MIC-AVX-NEXT:    retq
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 48) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length63(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length63(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR0]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-SSE41-LABEL: define i32 @length63(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-SSE41-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @length63(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @length63(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length63(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-LABEL: define i32 @length63(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length63(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-LABEL: define i32 @length63(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-AVX512F-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length63(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length63(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[M]]
+;
+
+
+; 63-byte memcmp whose i32 result is returned as-is (no equality-only use).
+; Every check group above expects the @memcmp call to be left in place.
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 63) nounwind
+  ret i32 %m
+}
+
+define i1 @length63_eq(ptr %x, ptr %y) nounwind {
+;
+; X64-LABEL: define i1 @length63_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-NEXT:    [[TMP11:%.*]] = load i128, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12:%.*]] = load i128, ptr [[TMP10]], align 1
+; X64-NEXT:    [[TMP13:%.*]] = xor i128 [[TMP11]], [[TMP12]]
+; X64-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 47
+; X64-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 47
+; X64-NEXT:    [[TMP16:%.*]] = load i128, ptr [[TMP14]], align 1
+; X64-NEXT:    [[TMP17:%.*]] = load i128, ptr [[TMP15]], align 1
+; X64-NEXT:    [[TMP18:%.*]] = xor i128 [[TMP16]], [[TMP17]]
+; X64-NEXT:    [[TMP19:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-NEXT:    [[TMP20:%.*]] = or i128 [[TMP13]], [[TMP18]]
+; X64-NEXT:    [[TMP21:%.*]] = or i128 [[TMP19]], [[TMP20]]
+; X64-NEXT:    [[TMP22:%.*]] = icmp ne i128 [[TMP21]], 0
+; X64-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X64-NEXT:    ret i1 [[TMP22]]
+;
+; X64-SSE41-LABEL: define i1 @length63_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i128, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12:%.*]] = load i128, ptr [[TMP10]], align 1
+; X64-SSE41-NEXT:    [[TMP13:%.*]] = xor i128 [[TMP11]], [[TMP12]]
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 47
+; X64-SSE41-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 47
+; X64-SSE41-NEXT:    [[TMP16:%.*]] = load i128, ptr [[TMP14]], align 1
+; X64-SSE41-NEXT:    [[TMP17:%.*]] = load i128, ptr [[TMP15]], align 1
+; X64-SSE41-NEXT:    [[TMP18:%.*]] = xor i128 [[TMP16]], [[TMP17]]
+; X64-SSE41-NEXT:    [[TMP19:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-SSE41-NEXT:    [[TMP20:%.*]] = or i128 [[TMP13]], [[TMP18]]
+; X64-SSE41-NEXT:    [[TMP21:%.*]] = or i128 [[TMP19]], [[TMP20]]
+; X64-SSE41-NEXT:    [[TMP22:%.*]] = icmp ne i128 [[TMP21]], 0
+; X64-SSE41-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X64-SSE41-NEXT:    ret i1 [[TMP22]]
+;
+; X64-AVX1-LABEL: define i1 @length63_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 31
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 31
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = icmp ne i256 [[TMP9]], 0
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP10]]
+;
+; X64-AVX2-LABEL: define i1 @length63_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 31
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 31
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = icmp ne i256 [[TMP9]], 0
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP10]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length63_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 31
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 31
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = icmp ne i256 [[TMP9]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512BW-256-NEXT:    ret i1 [[TMP10]]
+;
+; X64-AVX512BW-LABEL: define i1 @length63_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 31
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 31
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = icmp ne i256 [[TMP9]], 0
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512BW-NEXT:    ret i1 [[TMP10]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length63_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 31
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 31
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = icmp ne i256 [[TMP9]], 0
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512F-256-NEXT:    ret i1 [[TMP10]]
+;
+; X64-AVX512F-LABEL: define i1 @length63_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 31
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 31
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i256 [[TMP9]], 0
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512F-NEXT:    ret i1 [[TMP10]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length63_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 31
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 31
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = icmp ne i256 [[TMP9]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-MIC-AVX2-NEXT:    ret i1 [[TMP10]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length63_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 31
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 31
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i256 [[TMP9]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP10]]
+;
+; X64-AVX512-LABEL: length63_eq:
+; X64-AVX512:       # %bb.0:
+; X64-AVX512-NEXT:    vmovdqu (%rdi), %ymm0
+; X64-AVX512-NEXT:    vmovdqu 31(%rdi), %ymm1
+; X64-AVX512-NEXT:    vpxor 31(%rsi), %ymm1, %ymm1
+; X64-AVX512-NEXT:    vpxor (%rsi), %ymm0, %ymm0
+; X64-AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT:    vptest %ymm0, %ymm0
+; X64-AVX512-NEXT:    setne %al
+; X64-AVX512-NEXT:    vzeroupper
+; X64-AVX512-NEXT:    retq
+; X64-MIC-AVX-LABEL: length63_eq:
+; X64-MIC-AVX:       # %bb.0:
+; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %ymm0
+; X64-MIC-AVX-NEXT:    vmovdqu 31(%rdi), %ymm1
+; X64-MIC-AVX-NEXT:    vmovdqu (%rsi), %ymm2
+; X64-MIC-AVX-NEXT:    vmovdqu 31(%rsi), %ymm3
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm3, %zmm1, %k0
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm2, %zmm0, %k1
+; X64-MIC-AVX-NEXT:    kortestw %k0, %k1
+; X64-MIC-AVX-NEXT:    setne %al
+; X64-MIC-AVX-NEXT:    vzeroupper
+; X64-MIC-AVX-NEXT:    retq
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 63) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length63_lt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length63_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length63_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length63_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length63_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length63_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length63_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length63_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length63_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length63_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length63_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 63) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length63_gt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length63_gt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length63_gt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length63_gt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length63_gt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length63_gt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length63_gt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length63_gt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length63_gt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length63_gt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length63_gt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 63) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 63) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length63_eq_const(ptr %X) nounwind {
+;
+; X64-LABEL: define i1 @length63_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 65382562593882267225249597816672106294
+; X64-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP6]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP7]], 73389002901949112059321871464991568690
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 47
+; X64-NEXT:    [[TMP10:%.*]] = load i128, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = xor i128 [[TMP10]], 66716800424378146251538984255488604215
+; X64-NEXT:    [[TMP12:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X64-NEXT:    [[TMP13:%.*]] = or i128 [[TMP8]], [[TMP11]]
+; X64-NEXT:    [[TMP14:%.*]] = or i128 [[TMP12]], [[TMP13]]
+; X64-NEXT:    [[TMP15:%.*]] = icmp ne i128 [[TMP14]], 0
+; X64-NEXT:    [[TMP16:%.*]] = zext i1 [[TMP15]] to i32
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP16]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length63_eq_const(
+; X64-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 65382562593882267225249597816672106294
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP6]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP7]], 73389002901949112059321871464991568690
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 47
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = load i128, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = xor i128 [[TMP10]], 66716800424378146251538984255488604215
+; X64-SSE41-NEXT:    [[TMP12:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X64-SSE41-NEXT:    [[TMP13:%.*]] = or i128 [[TMP8]], [[TMP11]]
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = or i128 [[TMP12]], [[TMP13]]
+; X64-SSE41-NEXT:    [[TMP15:%.*]] = icmp ne i128 [[TMP14]], 0
+; X64-SSE41-NEXT:    [[TMP16:%.*]] = zext i1 [[TMP15]] to i32
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP16]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length63_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 31
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 22702550761799267355187145649125784605216755694630776232256222584591002841649
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp ne i256 [[TMP6]], 0
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP8]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length63_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 31
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 22702550761799267355187145649125784605216755694630776232256222584591002841649
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp ne i256 [[TMP6]], 0
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP8]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length63_eq_const(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 31
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 22702550761799267355187145649125784605216755694630776232256222584591002841649
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp ne i256 [[TMP6]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP8]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length63_eq_const(
+; X64-AVX512BW-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 31
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 22702550761799267355187145649125784605216755694630776232256222584591002841649
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = icmp ne i256 [[TMP6]], 0
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP8]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length63_eq_const(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 31
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 22702550761799267355187145649125784605216755694630776232256222584591002841649
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp ne i256 [[TMP6]], 0
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP8]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length63_eq_const(
+; X64-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 31
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 22702550761799267355187145649125784605216755694630776232256222584591002841649
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = icmp ne i256 [[TMP6]], 0
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP8]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length63_eq_const(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 31
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 22702550761799267355187145649125784605216755694630776232256222584591002841649
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp ne i256 [[TMP6]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP8]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length63_eq_const(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 31
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 22702550761799267355187145649125784605216755694630776232256222584591002841649
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = icmp ne i256 [[TMP6]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP8]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512-LABEL: length63_eq_const:
+; X64-AVX512:       # %bb.0:
+; X64-AVX512-NEXT:    vmovdqu (%rdi), %ymm0
+; X64-AVX512-NEXT:    vmovdqu 31(%rdi), %ymm1
+; X64-AVX512-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; X64-AVX512-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; X64-AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT:    vptest %ymm0, %ymm0
+; X64-AVX512-NEXT:    sete %al
+; X64-AVX512-NEXT:    vzeroupper
+; X64-AVX512-NEXT:    retq
+; X64-MIC-AVX-LABEL: length63_eq_const:
+; X64-MIC-AVX:       # %bb.0:
+; X64-MIC-AVX-NEXT:    vmovdqu (%rdi), %ymm0
+; X64-MIC-AVX-NEXT:    vmovdqu 31(%rdi), %ymm1
+; X64-MIC-AVX-NEXT:    vmovdqa {{.*#+}} ymm2 = [875770417,943142453,842084409,909456435,809056311,875770417,943142453,842084409]
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm2, %zmm1, %k0
+; X64-MIC-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [858927408,926299444,825243960,892613426,959985462,858927408,926299444,825243960]
+; X64-MIC-AVX-NEXT:    vpcmpneqd %zmm1, %zmm0, %k1
+; X64-MIC-AVX-NEXT:    kortestw %k0, %k1
+; X64-MIC-AVX-NEXT:    sete %al
+; X64-MIC-AVX-NEXT:    vzeroupper
+; X64-MIC-AVX-NEXT:    retq
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 63) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length64(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length64(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR0]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-SSE41-LABEL: define i32 @length64(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-SSE41-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @length64(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @length64(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length64(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-LABEL: define i32 @length64(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length64(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-LABEL: define i32 @length64(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-AVX512F-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length64(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length64(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[M]]
+;
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 64) nounwind
+  ret i32 %m
+}
+
+define i1 @length64_eq(ptr %x, ptr %y) nounwind {
+;
+; X64-LABEL: define i1 @length64_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-NEXT:    [[TMP11:%.*]] = load i128, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12:%.*]] = load i128, ptr [[TMP10]], align 1
+; X64-NEXT:    [[TMP13:%.*]] = xor i128 [[TMP11]], [[TMP12]]
+; X64-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 48
+; X64-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 48
+; X64-NEXT:    [[TMP16:%.*]] = load i128, ptr [[TMP14]], align 1
+; X64-NEXT:    [[TMP17:%.*]] = load i128, ptr [[TMP15]], align 1
+; X64-NEXT:    [[TMP18:%.*]] = xor i128 [[TMP16]], [[TMP17]]
+; X64-NEXT:    [[TMP19:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-NEXT:    [[TMP20:%.*]] = or i128 [[TMP13]], [[TMP18]]
+; X64-NEXT:    [[TMP21:%.*]] = or i128 [[TMP19]], [[TMP20]]
+; X64-NEXT:    [[TMP22:%.*]] = icmp ne i128 [[TMP21]], 0
+; X64-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X64-NEXT:    ret i1 [[TMP22]]
+;
+; X64-SSE41-LABEL: define i1 @length64_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = load i128, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP12:%.*]] = load i128, ptr [[TMP10]], align 1
+; X64-SSE41-NEXT:    [[TMP13:%.*]] = xor i128 [[TMP11]], [[TMP12]]
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 48
+; X64-SSE41-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 48
+; X64-SSE41-NEXT:    [[TMP16:%.*]] = load i128, ptr [[TMP14]], align 1
+; X64-SSE41-NEXT:    [[TMP17:%.*]] = load i128, ptr [[TMP15]], align 1
+; X64-SSE41-NEXT:    [[TMP18:%.*]] = xor i128 [[TMP16]], [[TMP17]]
+; X64-SSE41-NEXT:    [[TMP19:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-SSE41-NEXT:    [[TMP20:%.*]] = or i128 [[TMP13]], [[TMP18]]
+; X64-SSE41-NEXT:    [[TMP21:%.*]] = or i128 [[TMP19]], [[TMP20]]
+; X64-SSE41-NEXT:    [[TMP22:%.*]] = icmp ne i128 [[TMP21]], 0
+; X64-SSE41-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X64-SSE41-NEXT:    ret i1 [[TMP22]]
+;
+; X64-AVX1-LABEL: define i1 @length64_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = icmp ne i256 [[TMP9]], 0
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP10]]
+;
+; X64-AVX2-LABEL: define i1 @length64_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = icmp ne i256 [[TMP9]], 0
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP10]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length64_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = icmp ne i256 [[TMP9]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512BW-256-NEXT:    ret i1 [[TMP10]]
+;
+; X64-AVX512BW-LABEL: define i1 @length64_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i512, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = icmp ne i512 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512BW-NEXT:    ret i1 [[TMP3]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length64_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = icmp ne i256 [[TMP9]], 0
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512F-256-NEXT:    ret i1 [[TMP10]]
+;
+; X64-AVX512F-LABEL: define i1 @length64_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = icmp ne i512 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512F-NEXT:    ret i1 [[TMP3]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length64_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = icmp ne i256 [[TMP9]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-MIC-AVX2-NEXT:    ret i1 [[TMP10]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length64_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = icmp ne i512 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP3]]
+;
+; X64-AVX512-LABEL: length64_eq:
+; X64-AVX512:       # %bb.0:
+; X64-AVX512-NEXT:    vmovdqu64 (%rdi), %zmm0
+; X64-AVX512-NEXT:    vpcmpneqd (%rsi), %zmm0, %k0
+; X64-AVX512-NEXT:    kortestw %k0, %k0
+; X64-AVX512-NEXT:    setne %al
+; X64-AVX512-NEXT:    vzeroupper
+; X64-AVX512-NEXT:    retq
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 64) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+; Ordered (less-than-zero) test of a 64-byte memcmp result. Every check prefix
+; below expects the memcmp call to be left in place, followed by the icmp slt.
+define i1 @length64_lt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length64_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length64_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length64_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length64_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length64_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length64_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length64_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length64_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length64_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length64_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 64) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+; Ordered (greater-than-zero) test of a 64-byte memcmp result. Every check
+; prefix below expects the memcmp call to be left in place, followed by the
+; icmp sgt.
+define i1 @length64_gt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length64_gt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length64_gt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length64_gt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length64_gt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length64_gt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length64_gt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length64_gt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length64_gt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length64_gt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length64_gt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 64) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+; Equality test of the 64 bytes at %X against the global @.str. The expected
+; IR uses four i128 loads xor'ed with constants for the X64 and X64-SSE41
+; prefixes, two i256 loads xor'ed with constants for the AVX1, AVX2, -256 and
+; MIC-AVX2 prefixes, and a single i512 load compared against an i512 load of
+; @.str for the AVX512BW, AVX512F and MIC-AVX512F prefixes.
+define i1 @length64_eq_const(ptr %X) nounwind {
+;
+; X64-LABEL: define i1 @length64_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 65382562593882267225249597816672106294
+; X64-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP6]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP7]], 73389002901949112059321871464991568690
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 48
+; X64-NEXT:    [[TMP10:%.*]] = load i128, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = xor i128 [[TMP10]], 68051240286688436651889234231545575736
+; X64-NEXT:    [[TMP12:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X64-NEXT:    [[TMP13:%.*]] = or i128 [[TMP8]], [[TMP11]]
+; X64-NEXT:    [[TMP14:%.*]] = or i128 [[TMP12]], [[TMP13]]
+; X64-NEXT:    [[TMP15:%.*]] = icmp ne i128 [[TMP14]], 0
+; X64-NEXT:    [[TMP16:%.*]] = zext i1 [[TMP15]] to i32
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP16]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length64_eq_const(
+; X64-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-SSE41-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-SSE41-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-SSE41-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-SSE41-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 65382562593882267225249597816672106294
+; X64-SSE41-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-SSE41-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP6]], align 1
+; X64-SSE41-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP7]], 73389002901949112059321871464991568690
+; X64-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 48
+; X64-SSE41-NEXT:    [[TMP10:%.*]] = load i128, ptr [[TMP9]], align 1
+; X64-SSE41-NEXT:    [[TMP11:%.*]] = xor i128 [[TMP10]], 68051240286688436651889234231545575736
+; X64-SSE41-NEXT:    [[TMP12:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X64-SSE41-NEXT:    [[TMP13:%.*]] = or i128 [[TMP8]], [[TMP11]]
+; X64-SSE41-NEXT:    [[TMP14:%.*]] = or i128 [[TMP12]], [[TMP13]]
+; X64-SSE41-NEXT:    [[TMP15:%.*]] = icmp ne i128 [[TMP14]], 0
+; X64-SSE41-NEXT:    [[TMP16:%.*]] = zext i1 [[TMP15]] to i32
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP16]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length64_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp ne i256 [[TMP6]], 0
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP8]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length64_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp ne i256 [[TMP6]], 0
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP8]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length64_eq_const(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = icmp ne i256 [[TMP6]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP8]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length64_eq_const(
+; X64-AVX512BW-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i512, ptr @.str, align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = icmp ne i512 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length64_eq_const(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = icmp ne i256 [[TMP6]], 0
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP8]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length64_eq_const(
+; X64-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr @.str, align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = icmp ne i512 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length64_eq_const(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = icmp ne i256 [[TMP6]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP8]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length64_eq_const(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr @.str, align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = icmp ne i512 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 64) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; Returns the raw i32 result of a 96-byte memcmp. Every check prefix below
+; expects the memcmp call to be left in place and its result returned as is.
+define i32 @length96(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length96(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR0]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-SSE41-LABEL: define i32 @length96(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-SSE41-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @length96(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @length96(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length96(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-LABEL: define i32 @length96(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length96(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-LABEL: define i32 @length96(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-AVX512F-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length96(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length96(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[M]]
+;
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 96) nounwind
+  ret i32 %m
+}
+
+define i1 @length96_eq(ptr %x, ptr %y) nounwind {
+; X64-SSE-LABEL: length96_eq:
+; X64-SSE:       # %bb.0:
+; X64-SSE-NEXT:    pushq %rax
+; X64-SSE-NEXT:    movl $96, %edx
+; X64-SSE-NEXT:    callq memcmp
+; X64-SSE-NEXT:    testl %eax, %eax
+; X64-SSE-NEXT:    setne %al
+; X64-SSE-NEXT:    popq %rcx
+; X64-SSE-NEXT:    retq
+;
+;
+; X64-LABEL: define i1 @length96_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length96_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length96_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = load i256, ptr [[TMP10]], align 1
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = xor i256 [[TMP11]], [[TMP12]]
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX1-NEXT:    [[TMP15:%.*]] = or i256 [[TMP14]], [[TMP13]]
+; X64-AVX1-NEXT:    [[TMP16:%.*]] = icmp ne i256 [[TMP15]], 0
+; X64-AVX1-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP16]]
+;
+; X64-AVX2-LABEL: define i1 @length96_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = load i256, ptr [[TMP10]], align 1
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = xor i256 [[TMP11]], [[TMP12]]
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX2-NEXT:    [[TMP15:%.*]] = or i256 [[TMP14]], [[TMP13]]
+; X64-AVX2-NEXT:    [[TMP16:%.*]] = icmp ne i256 [[TMP15]], 0
+; X64-AVX2-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP16]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length96_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12:%.*]] = load i256, ptr [[TMP10]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP13:%.*]] = xor i256 [[TMP11]], [[TMP12]]
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-256-NEXT:    [[TMP15:%.*]] = or i256 [[TMP14]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    [[TMP16:%.*]] = icmp ne i256 [[TMP15]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X64-AVX512BW-256-NEXT:    ret i1 [[TMP16]]
+;
+; X64-AVX512BW-LABEL: define i1 @length96_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i512, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = zext i256 [[TMP6]] to i512
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = zext i256 [[TMP7]] to i512
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = xor i512 [[TMP8]], [[TMP9]]
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = or i512 [[TMP3]], [[TMP10]]
+; X64-AVX512BW-NEXT:    [[TMP12:%.*]] = icmp ne i512 [[TMP11]], 0
+; X64-AVX512BW-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512BW-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length96_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12:%.*]] = load i256, ptr [[TMP10]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP13:%.*]] = xor i256 [[TMP11]], [[TMP12]]
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX512F-256-NEXT:    [[TMP15:%.*]] = or i256 [[TMP14]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    [[TMP16:%.*]] = icmp ne i256 [[TMP15]], 0
+; X64-AVX512F-256-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X64-AVX512F-256-NEXT:    ret i1 [[TMP16]]
+;
+; X64-AVX512F-LABEL: define i1 @length96_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = zext i256 [[TMP6]] to i512
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = zext i256 [[TMP7]] to i512
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = xor i512 [[TMP8]], [[TMP9]]
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = or i512 [[TMP3]], [[TMP10]]
+; X64-AVX512F-NEXT:    [[TMP12:%.*]] = icmp ne i512 [[TMP11]], 0
+; X64-AVX512F-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX512F-NEXT:    ret i1 [[TMP12]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length96_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12:%.*]] = load i256, ptr [[TMP10]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP13:%.*]] = xor i256 [[TMP11]], [[TMP12]]
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX2-NEXT:    [[TMP15:%.*]] = or i256 [[TMP14]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    [[TMP16:%.*]] = icmp ne i256 [[TMP15]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X64-MIC-AVX2-NEXT:    ret i1 [[TMP16]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length96_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = zext i256 [[TMP6]] to i512
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = zext i256 [[TMP7]] to i512
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = xor i512 [[TMP8]], [[TMP9]]
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = or i512 [[TMP3]], [[TMP10]]
+; X64-MIC-AVX512F-NEXT:    [[TMP12:%.*]] = icmp ne i512 [[TMP11]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP12]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 96) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length96_lt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length96_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length96_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length96_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length96_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length96_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length96_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length96_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length96_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length96_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length96_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; Ordering use of the result: memcmp(%x, %y, 96) < 0. Every check prefix above
+; expects the memcmp call to be left in place rather than turned into loads.
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 96) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length96_gt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length96_gt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length96_gt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length96_gt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length96_gt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length96_gt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length96_gt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length96_gt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length96_gt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length96_gt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length96_gt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 96) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; Ordering use of the result: memcmp(%x, %y, 96) > 0. Every check prefix above
+; expects the memcmp call to be left in place rather than turned into loads.
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 96) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length96_eq_const(ptr %X) nounwind {
+; X64-SSE-LABEL: length96_eq_const:
+; X64-SSE:       # %bb.0:
+; X64-SSE-NEXT:    pushq %rax
+; X64-SSE-NEXT:    movl $.L.str, %esi
+; X64-SSE-NEXT:    movl $96, %edx
+; X64-SSE-NEXT:    callq memcmp
+; X64-SSE-NEXT:    testl %eax, %eax
+; X64-SSE-NEXT:    sete %al
+; X64-SSE-NEXT:    popq %rcx
+; X64-SSE-NEXT:    retq
+; NOTE(review): the X64-SSE assembly checks above look like leftovers from the old llc-based test -- confirm no RUN line still uses that prefix.
+;
+; X64-LABEL: define i1 @length96_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 96) #[[ATTR0]]
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length96_eq_const(
+; X64-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 96) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length96_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP6]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP7]], 24064810364522754539996825585178935186817565138301605567169177049701086016820
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = or i256 [[TMP9]], [[TMP8]]
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = icmp ne i256 [[TMP10]], 0
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = zext i1 [[TMP11]] to i32
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP12]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length96_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP6]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP7]], 24064810364522754539996825585178935186817565138301605567169177049701086016820
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = or i256 [[TMP9]], [[TMP8]]
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = icmp ne i256 [[TMP10]], 0
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = zext i1 [[TMP11]] to i32
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP12]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length96_eq_const(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP6]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP7]], 24064810364522754539996825585178935186817565138301605567169177049701086016820
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = or i256 [[TMP9]], [[TMP8]]
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = icmp ne i256 [[TMP10]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP12:%.*]] = zext i1 [[TMP11]] to i32
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP12]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length96_eq_const(
+; X64-AVX512BW-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i512, ptr @.str, align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = zext i256 [[TMP5]] to i512
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = xor i512 [[TMP6]], 24064810364522754539996825585178935186817565138301605567169177049701086016820
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = or i512 [[TMP3]], [[TMP7]]
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = icmp ne i512 [[TMP8]], 0
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = zext i1 [[TMP9]] to i32
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP10]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length96_eq_const(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP6]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP7]], 24064810364522754539996825585178935186817565138301605567169177049701086016820
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = or i256 [[TMP9]], [[TMP8]]
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = icmp ne i256 [[TMP10]], 0
+; X64-AVX512F-256-NEXT:    [[TMP12:%.*]] = zext i1 [[TMP11]] to i32
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP12]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length96_eq_const(
+; X64-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr @.str, align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = zext i256 [[TMP5]] to i512
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = xor i512 [[TMP6]], 24064810364522754539996825585178935186817565138301605567169177049701086016820
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = or i512 [[TMP3]], [[TMP7]]
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = icmp ne i512 [[TMP8]], 0
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = zext i1 [[TMP9]] to i32
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP10]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length96_eq_const(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP6]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP7]], 24064810364522754539996825585178935186817565138301605567169177049701086016820
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = or i256 [[TMP9]], [[TMP8]]
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = icmp ne i256 [[TMP10]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP12:%.*]] = zext i1 [[TMP11]] to i32
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP12]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length96_eq_const(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr @.str, align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = zext i256 [[TMP5]] to i512
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = xor i512 [[TMP6]], 24064810364522754539996825585178935186817565138301605567169177049701086016820
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = or i512 [[TMP3]], [[TMP7]]
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = icmp ne i512 [[TMP8]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = zext i1 [[TMP9]] to i32
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP10]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+; Equality-only compare of 96 bytes at %X against @.str. The plain X64 and X64-SSE41 prefixes keep the call; the AVX1/AVX2/*-256/MIC-AVX2 prefixes expect three i256 loads xor'ed with immediates (presumably the bytes of @.str); the AVX512BW/AVX512F/MIC-AVX512F prefixes expect an i512 load pair plus a zero-extended i256 tail.
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 96) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length127(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length127(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR0]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-SSE41-LABEL: define i32 @length127(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-SSE41-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @length127(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @length127(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length127(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-LABEL: define i32 @length127(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length127(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-LABEL: define i32 @length127(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-AVX512F-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length127(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length127(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[M]]
+;
+; Returns the raw i32 result of memcmp(%X, %Y, 127). Every check prefix above
+; expects the memcmp call to be left in place rather than turned into loads.
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 127) nounwind
+  ret i32 %m
+}
+
+define i1 @length127_eq(ptr %x, ptr %y) nounwind {
+; X64-SSE-LABEL: length127_eq:
+; X64-SSE:       # %bb.0:
+; X64-SSE-NEXT:    pushq %rax
+; X64-SSE-NEXT:    movl $127, %edx
+; X64-SSE-NEXT:    callq memcmp
+; X64-SSE-NEXT:    testl %eax, %eax
+; X64-SSE-NEXT:    setne %al
+; X64-SSE-NEXT:    popq %rcx
+; X64-SSE-NEXT:    retq
+;
+;
+; X64-LABEL: define i1 @length127_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length127_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length127_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = load i256, ptr [[TMP10]], align 1
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = xor i256 [[TMP11]], [[TMP12]]
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 95
+; X64-AVX1-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 95
+; X64-AVX1-NEXT:    [[TMP16:%.*]] = load i256, ptr [[TMP14]], align 1
+; X64-AVX1-NEXT:    [[TMP17:%.*]] = load i256, ptr [[TMP15]], align 1
+; X64-AVX1-NEXT:    [[TMP18:%.*]] = xor i256 [[TMP16]], [[TMP17]]
+; X64-AVX1-NEXT:    [[TMP19:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX1-NEXT:    [[TMP20:%.*]] = or i256 [[TMP13]], [[TMP18]]
+; X64-AVX1-NEXT:    [[TMP21:%.*]] = or i256 [[TMP19]], [[TMP20]]
+; X64-AVX1-NEXT:    [[TMP22:%.*]] = icmp ne i256 [[TMP21]], 0
+; X64-AVX1-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP22]]
+;
+; X64-AVX2-LABEL: define i1 @length127_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = load i256, ptr [[TMP10]], align 1
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = xor i256 [[TMP11]], [[TMP12]]
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 95
+; X64-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 95
+; X64-AVX2-NEXT:    [[TMP16:%.*]] = load i256, ptr [[TMP14]], align 1
+; X64-AVX2-NEXT:    [[TMP17:%.*]] = load i256, ptr [[TMP15]], align 1
+; X64-AVX2-NEXT:    [[TMP18:%.*]] = xor i256 [[TMP16]], [[TMP17]]
+; X64-AVX2-NEXT:    [[TMP19:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX2-NEXT:    [[TMP20:%.*]] = or i256 [[TMP13]], [[TMP18]]
+; X64-AVX2-NEXT:    [[TMP21:%.*]] = or i256 [[TMP19]], [[TMP20]]
+; X64-AVX2-NEXT:    [[TMP22:%.*]] = icmp ne i256 [[TMP21]], 0
+; X64-AVX2-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP22]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length127_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12:%.*]] = load i256, ptr [[TMP10]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP13:%.*]] = xor i256 [[TMP11]], [[TMP12]]
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 95
+; X64-AVX512BW-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 95
+; X64-AVX512BW-256-NEXT:    [[TMP16:%.*]] = load i256, ptr [[TMP14]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP17:%.*]] = load i256, ptr [[TMP15]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP18:%.*]] = xor i256 [[TMP16]], [[TMP17]]
+; X64-AVX512BW-256-NEXT:    [[TMP19:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-256-NEXT:    [[TMP20:%.*]] = or i256 [[TMP13]], [[TMP18]]
+; X64-AVX512BW-256-NEXT:    [[TMP21:%.*]] = or i256 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-256-NEXT:    [[TMP22:%.*]] = icmp ne i256 [[TMP21]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X64-AVX512BW-256-NEXT:    ret i1 [[TMP22]]
+;
+; X64-AVX512BW-LABEL: define i1 @length127_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i512, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 63
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 63
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i512, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = xor i512 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = or i512 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = icmp ne i512 [[TMP9]], 0
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512BW-NEXT:    ret i1 [[TMP10]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length127_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12:%.*]] = load i256, ptr [[TMP10]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP13:%.*]] = xor i256 [[TMP11]], [[TMP12]]
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 95
+; X64-AVX512F-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 95
+; X64-AVX512F-256-NEXT:    [[TMP16:%.*]] = load i256, ptr [[TMP14]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP17:%.*]] = load i256, ptr [[TMP15]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP18:%.*]] = xor i256 [[TMP16]], [[TMP17]]
+; X64-AVX512F-256-NEXT:    [[TMP19:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX512F-256-NEXT:    [[TMP20:%.*]] = or i256 [[TMP13]], [[TMP18]]
+; X64-AVX512F-256-NEXT:    [[TMP21:%.*]] = or i256 [[TMP19]], [[TMP20]]
+; X64-AVX512F-256-NEXT:    [[TMP22:%.*]] = icmp ne i256 [[TMP21]], 0
+; X64-AVX512F-256-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X64-AVX512F-256-NEXT:    ret i1 [[TMP22]]
+;
+; X64-AVX512F-LABEL: define i1 @length127_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 63
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 63
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i512, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = xor i512 [[TMP6]], [[TMP7]]
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = or i512 [[TMP3]], [[TMP8]]
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i512 [[TMP9]], 0
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512F-NEXT:    ret i1 [[TMP10]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length127_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12:%.*]] = load i256, ptr [[TMP10]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP13:%.*]] = xor i256 [[TMP11]], [[TMP12]]
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 95
+; X64-MIC-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 95
+; X64-MIC-AVX2-NEXT:    [[TMP16:%.*]] = load i256, ptr [[TMP14]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP17:%.*]] = load i256, ptr [[TMP15]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP18:%.*]] = xor i256 [[TMP16]], [[TMP17]]
+; X64-MIC-AVX2-NEXT:    [[TMP19:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX2-NEXT:    [[TMP20:%.*]] = or i256 [[TMP13]], [[TMP18]]
+; X64-MIC-AVX2-NEXT:    [[TMP21:%.*]] = or i256 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX2-NEXT:    [[TMP22:%.*]] = icmp ne i256 [[TMP21]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X64-MIC-AVX2-NEXT:    ret i1 [[TMP22]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length127_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 63
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 63
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i512, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = xor i512 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = or i512 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i512 [[TMP9]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP10]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 127) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length127_lt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length127_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length127_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length127_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length127_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length127_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length127_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length127_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length127_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length127_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length127_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; Test input: memcmp over a constant 127 bytes whose result is only tested
+; for "less than zero" (icmp slt). Every check group above expects the memcmp
+; call and the signed compare to be left in place, i.e. none of the checked
+; configurations expands this ordering query inline.
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 127) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length127_gt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length127_gt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length127_gt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length127_gt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length127_gt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length127_gt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length127_gt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length127_gt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length127_gt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length127_gt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length127_gt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 127) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; Test input: memcmp over a constant 127 bytes whose result is only tested
+; for "greater than zero" (icmp sgt). Every check group above expects the
+; memcmp call and the signed compare to be left in place, i.e. none of the
+; checked configurations expands this ordering query inline.
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 127) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length127_eq_const(ptr %X) nounwind {
+; X64-SSE-LABEL: length127_eq_const:
+; X64-SSE:       # %bb.0:
+; X64-SSE-NEXT:    pushq %rax
+; X64-SSE-NEXT:    movl $.L.str, %esi
+; X64-SSE-NEXT:    movl $127, %edx
+; X64-SSE-NEXT:    callq memcmp
+; X64-SSE-NEXT:    testl %eax, %eax
+; X64-SSE-NEXT:    sete %al
+; X64-SSE-NEXT:    popq %rcx
+; X64-SSE-NEXT:    retq
+; NOTE(review): the assembly-style checks just above (pushq/callq/sete) look like leftovers from an llc-based version of this test; every other group here matches IR. Confirm their prefix is unused by the RUN lines and drop them.
+; Test input compares 127 bytes at %X with @.str for equality; per the groups below, two configurations keep the memcmp call and the rest expect overlapping i256 (offsets 0/32/64/95) or i512 (offsets 0/63) load+xor+or sequences.
+; X64-LABEL: define i1 @length127_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 127) #[[ATTR0]]
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length127_eq_const(
+; X64-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 127) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length127_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP6]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP7]], 24064810364522754539996825585178935186817565138301605567169177049701086016820
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 95
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = xor i256 [[TMP10]], 24518896988982801982081367250212210778372643504230047123819838724519570650677
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = or i256 [[TMP8]], [[TMP11]]
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = or i256 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    [[TMP15:%.*]] = icmp ne i256 [[TMP14]], 0
+; X64-AVX1-NEXT:    [[TMP16:%.*]] = zext i1 [[TMP15]] to i32
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP16]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length127_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP6]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP7]], 24064810364522754539996825585178935186817565138301605567169177049701086016820
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 95
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = xor i256 [[TMP10]], 24518896988982801982081367250212210778372643504230047123819838724519570650677
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = or i256 [[TMP8]], [[TMP11]]
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = or i256 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    [[TMP15:%.*]] = icmp ne i256 [[TMP14]], 0
+; X64-AVX2-NEXT:    [[TMP16:%.*]] = zext i1 [[TMP15]] to i32
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP16]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length127_eq_const(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP6]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP7]], 24064810364522754539996825585178935186817565138301605567169177049701086016820
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 95
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = xor i256 [[TMP10]], 24518896988982801982081367250212210778372643504230047123819838724519570650677
+; X64-AVX512BW-256-NEXT:    [[TMP12:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX512BW-256-NEXT:    [[TMP13:%.*]] = or i256 [[TMP8]], [[TMP11]]
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = or i256 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    [[TMP15:%.*]] = icmp ne i256 [[TMP14]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP16:%.*]] = zext i1 [[TMP15]] to i32
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP16]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length127_eq_const(
+; X64-AVX512BW-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i512, ptr @.str, align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 63
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 63), align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = xor i512 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = or i512 [[TMP3]], [[TMP7]]
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = icmp ne i512 [[TMP8]], 0
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = zext i1 [[TMP9]] to i32
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP10]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length127_eq_const(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP6]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP7]], 24064810364522754539996825585178935186817565138301605567169177049701086016820
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 95
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = xor i256 [[TMP10]], 24518896988982801982081367250212210778372643504230047123819838724519570650677
+; X64-AVX512F-256-NEXT:    [[TMP12:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX512F-256-NEXT:    [[TMP13:%.*]] = or i256 [[TMP8]], [[TMP11]]
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = or i256 [[TMP12]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    [[TMP15:%.*]] = icmp ne i256 [[TMP14]], 0
+; X64-AVX512F-256-NEXT:    [[TMP16:%.*]] = zext i1 [[TMP15]] to i32
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP16]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length127_eq_const(
+; X64-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr @.str, align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 63
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 63), align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = xor i512 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = or i512 [[TMP3]], [[TMP7]]
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = icmp ne i512 [[TMP8]], 0
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = zext i1 [[TMP9]] to i32
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP10]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length127_eq_const(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP6]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP7]], 24064810364522754539996825585178935186817565138301605567169177049701086016820
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 95
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = xor i256 [[TMP10]], 24518896988982801982081367250212210778372643504230047123819838724519570650677
+; X64-MIC-AVX2-NEXT:    [[TMP12:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-MIC-AVX2-NEXT:    [[TMP13:%.*]] = or i256 [[TMP8]], [[TMP11]]
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = or i256 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    [[TMP15:%.*]] = icmp ne i256 [[TMP14]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP16:%.*]] = zext i1 [[TMP15]] to i32
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP16]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length127_eq_const(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr @.str, align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 63
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 63), align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = xor i512 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = or i512 [[TMP3]], [[TMP7]]
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = icmp ne i512 [[TMP8]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = zext i1 [[TMP9]] to i32
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP10]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 127) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length128(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length128(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR0]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-SSE41-LABEL: define i32 @length128(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-SSE41-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @length128(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @length128(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length128(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-LABEL: define i32 @length128(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length128(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-LABEL: define i32 @length128(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-AVX512F-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length128(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length128(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[M]]
+;
+; Test input: memcmp over a constant 128 bytes with the raw i32 result
+; returned to the caller. Every check group above expects the memcmp call to
+; be returned as-is, i.e. none of the checked configurations expands it.
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 128) nounwind
+  ret i32 %m
+}
+
+define i1 @length128_eq(ptr %x, ptr %y) nounwind {
+; X64-SSE-LABEL: length128_eq:
+; X64-SSE:       # %bb.0:
+; X64-SSE-NEXT:    pushq %rax
+; X64-SSE-NEXT:    movl $128, %edx
+; X64-SSE-NEXT:    callq memcmp
+; X64-SSE-NEXT:    testl %eax, %eax
+; X64-SSE-NEXT:    setne %al
+; X64-SSE-NEXT:    popq %rcx
+; X64-SSE-NEXT:    retq
+;
+;
+; X64-LABEL: define i1 @length128_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length128_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length128_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = load i256, ptr [[TMP10]], align 1
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = xor i256 [[TMP11]], [[TMP12]]
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 96
+; X64-AVX1-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 96
+; X64-AVX1-NEXT:    [[TMP16:%.*]] = load i256, ptr [[TMP14]], align 1
+; X64-AVX1-NEXT:    [[TMP17:%.*]] = load i256, ptr [[TMP15]], align 1
+; X64-AVX1-NEXT:    [[TMP18:%.*]] = xor i256 [[TMP16]], [[TMP17]]
+; X64-AVX1-NEXT:    [[TMP19:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX1-NEXT:    [[TMP20:%.*]] = or i256 [[TMP13]], [[TMP18]]
+; X64-AVX1-NEXT:    [[TMP21:%.*]] = or i256 [[TMP19]], [[TMP20]]
+; X64-AVX1-NEXT:    [[TMP22:%.*]] = icmp ne i256 [[TMP21]], 0
+; X64-AVX1-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP22]]
+;
+; X64-AVX2-LABEL: define i1 @length128_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = load i256, ptr [[TMP10]], align 1
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = xor i256 [[TMP11]], [[TMP12]]
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 96
+; X64-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 96
+; X64-AVX2-NEXT:    [[TMP16:%.*]] = load i256, ptr [[TMP14]], align 1
+; X64-AVX2-NEXT:    [[TMP17:%.*]] = load i256, ptr [[TMP15]], align 1
+; X64-AVX2-NEXT:    [[TMP18:%.*]] = xor i256 [[TMP16]], [[TMP17]]
+; X64-AVX2-NEXT:    [[TMP19:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX2-NEXT:    [[TMP20:%.*]] = or i256 [[TMP13]], [[TMP18]]
+; X64-AVX2-NEXT:    [[TMP21:%.*]] = or i256 [[TMP19]], [[TMP20]]
+; X64-AVX2-NEXT:    [[TMP22:%.*]] = icmp ne i256 [[TMP21]], 0
+; X64-AVX2-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP22]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length128_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP12:%.*]] = load i256, ptr [[TMP10]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP13:%.*]] = xor i256 [[TMP11]], [[TMP12]]
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 96
+; X64-AVX512BW-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 96
+; X64-AVX512BW-256-NEXT:    [[TMP16:%.*]] = load i256, ptr [[TMP14]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP17:%.*]] = load i256, ptr [[TMP15]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP18:%.*]] = xor i256 [[TMP16]], [[TMP17]]
+; X64-AVX512BW-256-NEXT:    [[TMP19:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-256-NEXT:    [[TMP20:%.*]] = or i256 [[TMP13]], [[TMP18]]
+; X64-AVX512BW-256-NEXT:    [[TMP21:%.*]] = or i256 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-256-NEXT:    [[TMP22:%.*]] = icmp ne i256 [[TMP21]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X64-AVX512BW-256-NEXT:    ret i1 [[TMP22]]
+;
+; X64-AVX512BW-LABEL: define i1 @length128_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i512, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i512, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = xor i512 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = or i512 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = icmp ne i512 [[TMP9]], 0
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512BW-NEXT:    ret i1 [[TMP10]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length128_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP12:%.*]] = load i256, ptr [[TMP10]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP13:%.*]] = xor i256 [[TMP11]], [[TMP12]]
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 96
+; X64-AVX512F-256-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 96
+; X64-AVX512F-256-NEXT:    [[TMP16:%.*]] = load i256, ptr [[TMP14]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP17:%.*]] = load i256, ptr [[TMP15]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP18:%.*]] = xor i256 [[TMP16]], [[TMP17]]
+; X64-AVX512F-256-NEXT:    [[TMP19:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX512F-256-NEXT:    [[TMP20:%.*]] = or i256 [[TMP13]], [[TMP18]]
+; X64-AVX512F-256-NEXT:    [[TMP21:%.*]] = or i256 [[TMP19]], [[TMP20]]
+; X64-AVX512F-256-NEXT:    [[TMP22:%.*]] = icmp ne i256 [[TMP21]], 0
+; X64-AVX512F-256-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X64-AVX512F-256-NEXT:    ret i1 [[TMP22]]
+;
+; X64-AVX512F-LABEL: define i1 @length128_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i512, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = xor i512 [[TMP6]], [[TMP7]]
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = or i512 [[TMP3]], [[TMP8]]
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i512 [[TMP9]], 0
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX512F-NEXT:    ret i1 [[TMP10]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length128_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP12:%.*]] = load i256, ptr [[TMP10]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP13:%.*]] = xor i256 [[TMP11]], [[TMP12]]
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 96
+; X64-MIC-AVX2-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 96
+; X64-MIC-AVX2-NEXT:    [[TMP16:%.*]] = load i256, ptr [[TMP14]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP17:%.*]] = load i256, ptr [[TMP15]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP18:%.*]] = xor i256 [[TMP16]], [[TMP17]]
+; X64-MIC-AVX2-NEXT:    [[TMP19:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX2-NEXT:    [[TMP20:%.*]] = or i256 [[TMP13]], [[TMP18]]
+; X64-MIC-AVX2-NEXT:    [[TMP21:%.*]] = or i256 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX2-NEXT:    [[TMP22:%.*]] = icmp ne i256 [[TMP21]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X64-MIC-AVX2-NEXT:    ret i1 [[TMP22]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length128_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i512, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = xor i512 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = or i512 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = icmp ne i512 [[TMP9]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP10]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 128) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+; Compares two 128-byte buffers with memcmp and returns whether the result is
+; negative (signed less-than zero). Every check block below expects the memcmp
+; call to be left in place rather than expanded inline.
+define i1 @length128_lt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length128_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length128_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length128_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length128_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length128_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length128_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length128_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length128_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length128_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length128_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 128) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+; Compares two 128-byte buffers with memcmp and returns whether the result is
+; positive (signed greater-than zero). Every check block below expects the
+; memcmp call to be left in place rather than expanded inline.
+define i1 @length128_gt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length128_gt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length128_gt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length128_gt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length128_gt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length128_gt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length128_gt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length128_gt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length128_gt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length128_gt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length128_gt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 128) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 128) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+; Compares 128 bytes at %X against the global @.str with memcmp and returns
+; whether the result equals zero. The IR check blocks below show three
+; outcomes: the call is kept (baseline and SSE4.1 configurations); it becomes
+; four i256 loads xor'ed with immediate constants (AVX1, AVX2, the two -256
+; AVX-512 variants and MIC-AVX2); or it becomes two i512 loads of %X xor'ed
+; with i512 loads from @.str (AVX512BW, AVX512F and MIC-AVX512F).
+; NOTE(review): the assembly-style check lines directly after the define
+; (pushq / callq memcmp / retq) look like leftovers from an llc-based version
+; of this test - confirm whether any RUN line still uses that prefix.
+define i1 @length128_eq_const(ptr %X) nounwind {
+; X64-SSE-LABEL: length128_eq_const:
+; X64-SSE:       # %bb.0:
+; X64-SSE-NEXT:    pushq %rax
+; X64-SSE-NEXT:    movl $.L.str, %esi
+; X64-SSE-NEXT:    movl $128, %edx
+; X64-SSE-NEXT:    callq memcmp
+; X64-SSE-NEXT:    testl %eax, %eax
+; X64-SSE-NEXT:    sete %al
+; X64-SSE-NEXT:    popq %rcx
+; X64-SSE-NEXT:    retq
+;
+;
+; X64-LABEL: define i1 @length128_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 128) #[[ATTR0]]
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length128_eq_const(
+; X64-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 128) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length128_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP6]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP7]], 24064810364522754539996825585178935186817565138301605567169177049701086016820
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 96
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = xor i256 [[TMP10]], 24972983613442865430775334151281434151203991406697113551929636559217741018934
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = or i256 [[TMP8]], [[TMP11]]
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = or i256 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    [[TMP15:%.*]] = icmp ne i256 [[TMP14]], 0
+; X64-AVX1-NEXT:    [[TMP16:%.*]] = zext i1 [[TMP15]] to i32
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP16]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length128_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP6]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP7]], 24064810364522754539996825585178935186817565138301605567169177049701086016820
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 96
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = xor i256 [[TMP10]], 24972983613442865430775334151281434151203991406697113551929636559217741018934
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = or i256 [[TMP8]], [[TMP11]]
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = or i256 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    [[TMP15:%.*]] = icmp ne i256 [[TMP14]], 0
+; X64-AVX2-NEXT:    [[TMP16:%.*]] = zext i1 [[TMP15]] to i32
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP16]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length128_eq_const(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX512BW-256-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512BW-256-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-AVX512BW-256-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512BW-256-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP6]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP7]], 24064810364522754539996825585178935186817565138301605567169177049701086016820
+; X64-AVX512BW-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 96
+; X64-AVX512BW-256-NEXT:    [[TMP10:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-AVX512BW-256-NEXT:    [[TMP11:%.*]] = xor i256 [[TMP10]], 24972983613442865430775334151281434151203991406697113551929636559217741018934
+; X64-AVX512BW-256-NEXT:    [[TMP12:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX512BW-256-NEXT:    [[TMP13:%.*]] = or i256 [[TMP8]], [[TMP11]]
+; X64-AVX512BW-256-NEXT:    [[TMP14:%.*]] = or i256 [[TMP12]], [[TMP13]]
+; X64-AVX512BW-256-NEXT:    [[TMP15:%.*]] = icmp ne i256 [[TMP14]], 0
+; X64-AVX512BW-256-NEXT:    [[TMP16:%.*]] = zext i1 [[TMP15]] to i32
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP16]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length128_eq_const(
+; X64-AVX512BW-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i512, ptr @.str, align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 64), align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = xor i512 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = or i512 [[TMP3]], [[TMP7]]
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = icmp ne i512 [[TMP8]], 0
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = zext i1 [[TMP9]] to i32
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP10]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length128_eq_const(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX512F-256-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX512F-256-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-AVX512F-256-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512F-256-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP6]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP7]], 24064810364522754539996825585178935186817565138301605567169177049701086016820
+; X64-AVX512F-256-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 96
+; X64-AVX512F-256-NEXT:    [[TMP10:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-AVX512F-256-NEXT:    [[TMP11:%.*]] = xor i256 [[TMP10]], 24972983613442865430775334151281434151203991406697113551929636559217741018934
+; X64-AVX512F-256-NEXT:    [[TMP12:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX512F-256-NEXT:    [[TMP13:%.*]] = or i256 [[TMP8]], [[TMP11]]
+; X64-AVX512F-256-NEXT:    [[TMP14:%.*]] = or i256 [[TMP12]], [[TMP13]]
+; X64-AVX512F-256-NEXT:    [[TMP15:%.*]] = icmp ne i256 [[TMP14]], 0
+; X64-AVX512F-256-NEXT:    [[TMP16:%.*]] = zext i1 [[TMP15]] to i32
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP16]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length128_eq_const(
+; X64-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr @.str, align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 64), align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = xor i512 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = or i512 [[TMP3]], [[TMP7]]
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = icmp ne i512 [[TMP8]], 0
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = zext i1 [[TMP9]] to i32
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP10]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length128_eq_const(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-MIC-AVX2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-MIC-AVX2-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-MIC-AVX2-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-MIC-AVX2-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP6]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP7]], 24064810364522754539996825585178935186817565138301605567169177049701086016820
+; X64-MIC-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 96
+; X64-MIC-AVX2-NEXT:    [[TMP10:%.*]] = load i256, ptr [[TMP9]], align 1
+; X64-MIC-AVX2-NEXT:    [[TMP11:%.*]] = xor i256 [[TMP10]], 24972983613442865430775334151281434151203991406697113551929636559217741018934
+; X64-MIC-AVX2-NEXT:    [[TMP12:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-MIC-AVX2-NEXT:    [[TMP13:%.*]] = or i256 [[TMP8]], [[TMP11]]
+; X64-MIC-AVX2-NEXT:    [[TMP14:%.*]] = or i256 [[TMP12]], [[TMP13]]
+; X64-MIC-AVX2-NEXT:    [[TMP15:%.*]] = icmp ne i256 [[TMP14]], 0
+; X64-MIC-AVX2-NEXT:    [[TMP16:%.*]] = zext i1 [[TMP15]] to i32
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP16]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length128_eq_const(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr @.str, align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 64), align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = xor i512 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = or i512 [[TMP3]], [[TMP7]]
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = icmp ne i512 [[TMP8]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = zext i1 [[TMP9]] to i32
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP10]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 128) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; Returns the raw i32 result of memcmp over 192 bytes. Every check block below
+; expects the memcmp call to be left in place rather than expanded inline.
+define i32 @length192(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length192(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR0]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-SSE41-LABEL: define i32 @length192(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-SSE41-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @length192(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @length192(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length192(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-LABEL: define i32 @length192(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length192(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-LABEL: define i32 @length192(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX512F-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length192(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length192(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[M]]
+;
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 192) nounwind
+  ret i32 %m
+}
+
+; Returns whether memcmp over 192 bytes is non-zero (despite the _eq suffix in
+; the name, the predicate is icmp ne). In the IR check blocks below, the
+; AVX512BW, AVX512F and MIC-AVX512F configurations expect three i512 load/xor
+; pairs or'ed together and compared against zero; the remaining IR check
+; blocks expect the memcmp call to be left in place.
+; NOTE(review): the assembly-style check lines directly after the define
+; (pushq / callq memcmp / retq) look like leftovers from an llc-based version
+; of this test - confirm whether any RUN line still uses that prefix.
+define i1 @length192_eq(ptr %x, ptr %y) nounwind {
+; X64-SSE-LABEL: length192_eq:
+; X64-SSE:       # %bb.0:
+; X64-SSE-NEXT:    pushq %rax
+; X64-SSE-NEXT:    movl $192, %edx
+; X64-SSE-NEXT:    callq memcmp
+; X64-SSE-NEXT:    testl %eax, %eax
+; X64-SSE-NEXT:    setne %al
+; X64-SSE-NEXT:    popq %rcx
+; X64-SSE-NEXT:    retq
+;
+;
+; X64-LABEL: define i1 @length192_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length192_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length192_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length192_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length192_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length192_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i512, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i512, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = xor i512 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 128
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 128
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i512, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12:%.*]] = load i512, ptr [[TMP10]], align 1
+; X64-AVX512BW-NEXT:    [[TMP13:%.*]] = xor i512 [[TMP11]], [[TMP12]]
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = or i512 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-NEXT:    [[TMP15:%.*]] = or i512 [[TMP14]], [[TMP13]]
+; X64-AVX512BW-NEXT:    [[TMP16:%.*]] = icmp ne i512 [[TMP15]], 0
+; X64-AVX512BW-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X64-AVX512BW-NEXT:    ret i1 [[TMP16]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length192_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length192_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i512, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = xor i512 [[TMP6]], [[TMP7]]
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 128
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 128
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i512, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12:%.*]] = load i512, ptr [[TMP10]], align 1
+; X64-AVX512F-NEXT:    [[TMP13:%.*]] = xor i512 [[TMP11]], [[TMP12]]
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = or i512 [[TMP3]], [[TMP8]]
+; X64-AVX512F-NEXT:    [[TMP15:%.*]] = or i512 [[TMP14]], [[TMP13]]
+; X64-AVX512F-NEXT:    [[TMP16:%.*]] = icmp ne i512 [[TMP15]], 0
+; X64-AVX512F-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X64-AVX512F-NEXT:    ret i1 [[TMP16]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length192_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length192_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i512, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = xor i512 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 128
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 128
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i512, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12:%.*]] = load i512, ptr [[TMP10]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP13:%.*]] = xor i512 [[TMP11]], [[TMP12]]
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = or i512 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX512F-NEXT:    [[TMP15:%.*]] = or i512 [[TMP14]], [[TMP13]]
+; X64-MIC-AVX512F-NEXT:    [[TMP16:%.*]] = icmp ne i512 [[TMP15]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP16]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 192) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length192_lt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length192_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length192_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length192_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length192_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length192_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length192_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length192_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length192_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length192_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length192_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 192) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length192_gt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length192_gt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length192_gt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length192_gt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length192_gt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length192_gt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length192_gt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length192_gt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length192_gt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length192_gt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length192_gt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 192) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 192) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length192_eq_const(ptr %X) nounwind {
+; X64-SSE-LABEL: length192_eq_const:
+; X64-SSE:       # %bb.0:
+; X64-SSE-NEXT:    pushq %rax
+; X64-SSE-NEXT:    movl $.L.str, %esi
+; X64-SSE-NEXT:    movl $192, %edx
+; X64-SSE-NEXT:    callq memcmp
+; X64-SSE-NEXT:    testl %eax, %eax
+; X64-SSE-NEXT:    sete %al
+; X64-SSE-NEXT:    popq %rcx
+; X64-SSE-NEXT:    retq
+;
+;
+; X64-LABEL: define i1 @length192_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 192) #[[ATTR0]]
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length192_eq_const(
+; X64-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 192) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length192_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 192) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length192_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 192) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length192_eq_const(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 192) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length192_eq_const(
+; X64-AVX512BW-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i512, ptr @.str, align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 64), align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = xor i512 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 128
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = load i512, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 128), align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = xor i512 [[TMP9]], [[TMP10]]
+; X64-AVX512BW-NEXT:    [[TMP12:%.*]] = or i512 [[TMP3]], [[TMP7]]
+; X64-AVX512BW-NEXT:    [[TMP13:%.*]] = or i512 [[TMP12]], [[TMP11]]
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = icmp ne i512 [[TMP13]], 0
+; X64-AVX512BW-NEXT:    [[TMP15:%.*]] = zext i1 [[TMP14]] to i32
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP15]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length192_eq_const(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 192) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length192_eq_const(
+; X64-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr @.str, align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 64), align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = xor i512 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 128
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = load i512, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 128), align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = xor i512 [[TMP9]], [[TMP10]]
+; X64-AVX512F-NEXT:    [[TMP12:%.*]] = or i512 [[TMP3]], [[TMP7]]
+; X64-AVX512F-NEXT:    [[TMP13:%.*]] = or i512 [[TMP12]], [[TMP11]]
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = icmp ne i512 [[TMP13]], 0
+; X64-AVX512F-NEXT:    [[TMP15:%.*]] = zext i1 [[TMP14]] to i32
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP15]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length192_eq_const(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 192) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length192_eq_const(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr @.str, align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 64), align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = xor i512 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 128
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = load i512, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 128), align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = xor i512 [[TMP9]], [[TMP10]]
+; X64-MIC-AVX512F-NEXT:    [[TMP12:%.*]] = or i512 [[TMP3]], [[TMP7]]
+; X64-MIC-AVX512F-NEXT:    [[TMP13:%.*]] = or i512 [[TMP12]], [[TMP11]]
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = icmp ne i512 [[TMP13]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP15:%.*]] = zext i1 [[TMP14]] to i32
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP15]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 192) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length255(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length255(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR0]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-SSE41-LABEL: define i32 @length255(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-SSE41-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @length255(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @length255(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length255(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-LABEL: define i32 @length255(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length255(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-LABEL: define i32 @length255(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX512F-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length255(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length255(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[M]]
+;
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 255) nounwind
+  ret i32 %m
+}
+
+define i1 @length255_eq(ptr %x, ptr %y) nounwind {
+; X64-SSE-LABEL: length255_eq:
+; X64-SSE:       # %bb.0:
+; X64-SSE-NEXT:    pushq %rax
+; X64-SSE-NEXT:    movl $255, %edx
+; X64-SSE-NEXT:    callq memcmp
+; X64-SSE-NEXT:    testl %eax, %eax
+; X64-SSE-NEXT:    setne %al
+; X64-SSE-NEXT:    popq %rcx
+; X64-SSE-NEXT:    retq
+;
+;
+; X64-LABEL: define i1 @length255_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length255_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length255_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length255_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length255_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length255_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i512, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i512, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = xor i512 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 128
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 128
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i512, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12:%.*]] = load i512, ptr [[TMP10]], align 1
+; X64-AVX512BW-NEXT:    [[TMP13:%.*]] = xor i512 [[TMP11]], [[TMP12]]
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 191
+; X64-AVX512BW-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 191
+; X64-AVX512BW-NEXT:    [[TMP16:%.*]] = load i512, ptr [[TMP14]], align 1
+; X64-AVX512BW-NEXT:    [[TMP17:%.*]] = load i512, ptr [[TMP15]], align 1
+; X64-AVX512BW-NEXT:    [[TMP18:%.*]] = xor i512 [[TMP16]], [[TMP17]]
+; X64-AVX512BW-NEXT:    [[TMP19:%.*]] = or i512 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-NEXT:    [[TMP20:%.*]] = or i512 [[TMP13]], [[TMP18]]
+; X64-AVX512BW-NEXT:    [[TMP21:%.*]] = or i512 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-NEXT:    [[TMP22:%.*]] = icmp ne i512 [[TMP21]], 0
+; X64-AVX512BW-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X64-AVX512BW-NEXT:    ret i1 [[TMP22]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length255_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length255_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i512, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = xor i512 [[TMP6]], [[TMP7]]
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 128
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 128
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i512, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12:%.*]] = load i512, ptr [[TMP10]], align 1
+; X64-AVX512F-NEXT:    [[TMP13:%.*]] = xor i512 [[TMP11]], [[TMP12]]
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 191
+; X64-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 191
+; X64-AVX512F-NEXT:    [[TMP16:%.*]] = load i512, ptr [[TMP14]], align 1
+; X64-AVX512F-NEXT:    [[TMP17:%.*]] = load i512, ptr [[TMP15]], align 1
+; X64-AVX512F-NEXT:    [[TMP18:%.*]] = xor i512 [[TMP16]], [[TMP17]]
+; X64-AVX512F-NEXT:    [[TMP19:%.*]] = or i512 [[TMP3]], [[TMP8]]
+; X64-AVX512F-NEXT:    [[TMP20:%.*]] = or i512 [[TMP13]], [[TMP18]]
+; X64-AVX512F-NEXT:    [[TMP21:%.*]] = or i512 [[TMP19]], [[TMP20]]
+; X64-AVX512F-NEXT:    [[TMP22:%.*]] = icmp ne i512 [[TMP21]], 0
+; X64-AVX512F-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X64-AVX512F-NEXT:    ret i1 [[TMP22]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length255_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length255_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i512, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = xor i512 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 128
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 128
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i512, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12:%.*]] = load i512, ptr [[TMP10]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP13:%.*]] = xor i512 [[TMP11]], [[TMP12]]
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 191
+; X64-MIC-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 191
+; X64-MIC-AVX512F-NEXT:    [[TMP16:%.*]] = load i512, ptr [[TMP14]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP17:%.*]] = load i512, ptr [[TMP15]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP18:%.*]] = xor i512 [[TMP16]], [[TMP17]]
+; X64-MIC-AVX512F-NEXT:    [[TMP19:%.*]] = or i512 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX512F-NEXT:    [[TMP20:%.*]] = or i512 [[TMP13]], [[TMP18]]
+; X64-MIC-AVX512F-NEXT:    [[TMP21:%.*]] = or i512 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX512F-NEXT:    [[TMP22:%.*]] = icmp ne i512 [[TMP21]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP22]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 255) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length255_lt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length255_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length255_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length255_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length255_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length255_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length255_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length255_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length255_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length255_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length255_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 255) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length255_gt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length255_gt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length255_gt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length255_gt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length255_gt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length255_gt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length255_gt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length255_gt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length255_gt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length255_gt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length255_gt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 255) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 255) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length255_eq_const(ptr %X) nounwind {
+; X64-SSE-LABEL: length255_eq_const:
+; X64-SSE:       # %bb.0:
+; X64-SSE-NEXT:    pushq %rax
+; X64-SSE-NEXT:    movl $.L.str, %esi
+; X64-SSE-NEXT:    movl $255, %edx
+; X64-SSE-NEXT:    callq memcmp
+; X64-SSE-NEXT:    testl %eax, %eax
+; X64-SSE-NEXT:    sete %al
+; X64-SSE-NEXT:    popq %rcx
+; X64-SSE-NEXT:    retq
+; NOTE(review): the X64-SSE checks above are assembly-style while every other prefix checks IR; presumably stale -- confirm no RUN line uses X64-SSE.
+;
+; X64-LABEL: define i1 @length255_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 255) #[[ATTR0]]
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length255_eq_const(
+; X64-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 255) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length255_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 255) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length255_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 255) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length255_eq_const(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 255) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length255_eq_const(
+; X64-AVX512BW-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i512, ptr @.str, align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 64), align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = xor i512 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 128
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = load i512, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 128), align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = xor i512 [[TMP9]], [[TMP10]]
+; X64-AVX512BW-NEXT:    [[TMP12:%.*]] = getelementptr i8, ptr [[X]], i64 191
+; X64-AVX512BW-NEXT:    [[TMP13:%.*]] = load i512, ptr [[TMP12]], align 1
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 191), align 1
+; X64-AVX512BW-NEXT:    [[TMP15:%.*]] = xor i512 [[TMP13]], [[TMP14]]
+; X64-AVX512BW-NEXT:    [[TMP16:%.*]] = or i512 [[TMP3]], [[TMP7]]
+; X64-AVX512BW-NEXT:    [[TMP17:%.*]] = or i512 [[TMP11]], [[TMP15]]
+; X64-AVX512BW-NEXT:    [[TMP18:%.*]] = or i512 [[TMP16]], [[TMP17]]
+; X64-AVX512BW-NEXT:    [[TMP19:%.*]] = icmp ne i512 [[TMP18]], 0
+; X64-AVX512BW-NEXT:    [[TMP20:%.*]] = zext i1 [[TMP19]] to i32
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP20]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length255_eq_const(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 255) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length255_eq_const(
+; X64-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr @.str, align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 64), align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = xor i512 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 128
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = load i512, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 128), align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = xor i512 [[TMP9]], [[TMP10]]
+; X64-AVX512F-NEXT:    [[TMP12:%.*]] = getelementptr i8, ptr [[X]], i64 191
+; X64-AVX512F-NEXT:    [[TMP13:%.*]] = load i512, ptr [[TMP12]], align 1
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 191), align 1
+; X64-AVX512F-NEXT:    [[TMP15:%.*]] = xor i512 [[TMP13]], [[TMP14]]
+; X64-AVX512F-NEXT:    [[TMP16:%.*]] = or i512 [[TMP3]], [[TMP7]]
+; X64-AVX512F-NEXT:    [[TMP17:%.*]] = or i512 [[TMP11]], [[TMP15]]
+; X64-AVX512F-NEXT:    [[TMP18:%.*]] = or i512 [[TMP16]], [[TMP17]]
+; X64-AVX512F-NEXT:    [[TMP19:%.*]] = icmp ne i512 [[TMP18]], 0
+; X64-AVX512F-NEXT:    [[TMP20:%.*]] = zext i1 [[TMP19]] to i32
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP20]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length255_eq_const(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 255) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length255_eq_const(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr @.str, align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 64), align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = xor i512 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 128
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = load i512, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 128), align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = xor i512 [[TMP9]], [[TMP10]]
+; X64-MIC-AVX512F-NEXT:    [[TMP12:%.*]] = getelementptr i8, ptr [[X]], i64 191
+; X64-MIC-AVX512F-NEXT:    [[TMP13:%.*]] = load i512, ptr [[TMP12]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 191), align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP15:%.*]] = xor i512 [[TMP13]], [[TMP14]]
+; X64-MIC-AVX512F-NEXT:    [[TMP16:%.*]] = or i512 [[TMP3]], [[TMP7]]
+; X64-MIC-AVX512F-NEXT:    [[TMP17:%.*]] = or i512 [[TMP11]], [[TMP15]]
+; X64-MIC-AVX512F-NEXT:    [[TMP18:%.*]] = or i512 [[TMP16]], [[TMP17]]
+; X64-MIC-AVX512F-NEXT:    [[TMP19:%.*]] = icmp ne i512 [[TMP18]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP20:%.*]] = zext i1 [[TMP19]] to i32
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP20]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 255) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length256(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length256(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR0]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-SSE41-LABEL: define i32 @length256(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-SSE41-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @length256(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @length256(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length256(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-LABEL: define i32 @length256(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length256(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-LABEL: define i32 @length256(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX512F-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length256(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length256(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[M]]
+;
+
+; Test input: a 256-byte memcmp whose i32 result is returned unchanged.
+; Every check prefix above expects the memcmp call to be left in place.
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 256) nounwind
+  ret i32 %m
+}
+
+define i1 @length256_eq(ptr %x, ptr %y) nounwind {
+; X64-SSE-LABEL: length256_eq:
+; X64-SSE:       # %bb.0:
+; X64-SSE-NEXT:    pushq %rax
+; X64-SSE-NEXT:    movl $256, %edx # imm = 0x100
+; X64-SSE-NEXT:    callq memcmp
+; X64-SSE-NEXT:    testl %eax, %eax
+; X64-SSE-NEXT:    setne %al
+; X64-SSE-NEXT:    popq %rcx
+; X64-SSE-NEXT:    retq
+; NOTE(review): the X64-SSE checks above are assembly-style while every other prefix checks IR; presumably stale -- confirm no RUN line uses X64-SSE.
+; Despite the _eq suffix, the IR body of this function tests the memcmp result with icmp ne.
+; X64-LABEL: define i1 @length256_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length256_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length256_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length256_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length256_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length256_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i512, ptr [[Y]], align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = load i512, ptr [[TMP5]], align 1
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = xor i512 [[TMP6]], [[TMP7]]
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 128
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 128
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = load i512, ptr [[TMP9]], align 1
+; X64-AVX512BW-NEXT:    [[TMP12:%.*]] = load i512, ptr [[TMP10]], align 1
+; X64-AVX512BW-NEXT:    [[TMP13:%.*]] = xor i512 [[TMP11]], [[TMP12]]
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 192
+; X64-AVX512BW-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 192
+; X64-AVX512BW-NEXT:    [[TMP16:%.*]] = load i512, ptr [[TMP14]], align 1
+; X64-AVX512BW-NEXT:    [[TMP17:%.*]] = load i512, ptr [[TMP15]], align 1
+; X64-AVX512BW-NEXT:    [[TMP18:%.*]] = xor i512 [[TMP16]], [[TMP17]]
+; X64-AVX512BW-NEXT:    [[TMP19:%.*]] = or i512 [[TMP3]], [[TMP8]]
+; X64-AVX512BW-NEXT:    [[TMP20:%.*]] = or i512 [[TMP13]], [[TMP18]]
+; X64-AVX512BW-NEXT:    [[TMP21:%.*]] = or i512 [[TMP19]], [[TMP20]]
+; X64-AVX512BW-NEXT:    [[TMP22:%.*]] = icmp ne i512 [[TMP21]], 0
+; X64-AVX512BW-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X64-AVX512BW-NEXT:    ret i1 [[TMP22]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length256_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length256_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr [[Y]], align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = load i512, ptr [[TMP5]], align 1
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = xor i512 [[TMP6]], [[TMP7]]
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 128
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 128
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = load i512, ptr [[TMP9]], align 1
+; X64-AVX512F-NEXT:    [[TMP12:%.*]] = load i512, ptr [[TMP10]], align 1
+; X64-AVX512F-NEXT:    [[TMP13:%.*]] = xor i512 [[TMP11]], [[TMP12]]
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 192
+; X64-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 192
+; X64-AVX512F-NEXT:    [[TMP16:%.*]] = load i512, ptr [[TMP14]], align 1
+; X64-AVX512F-NEXT:    [[TMP17:%.*]] = load i512, ptr [[TMP15]], align 1
+; X64-AVX512F-NEXT:    [[TMP18:%.*]] = xor i512 [[TMP16]], [[TMP17]]
+; X64-AVX512F-NEXT:    [[TMP19:%.*]] = or i512 [[TMP3]], [[TMP8]]
+; X64-AVX512F-NEXT:    [[TMP20:%.*]] = or i512 [[TMP13]], [[TMP18]]
+; X64-AVX512F-NEXT:    [[TMP21:%.*]] = or i512 [[TMP19]], [[TMP20]]
+; X64-AVX512F-NEXT:    [[TMP22:%.*]] = icmp ne i512 [[TMP21]], 0
+; X64-AVX512F-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X64-AVX512F-NEXT:    ret i1 [[TMP22]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length256_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length256_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr [[Y]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 64
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = load i512, ptr [[TMP5]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = xor i512 [[TMP6]], [[TMP7]]
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[X]], i64 128
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[Y]], i64 128
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = load i512, ptr [[TMP9]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP12:%.*]] = load i512, ptr [[TMP10]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP13:%.*]] = xor i512 [[TMP11]], [[TMP12]]
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i64 192
+; X64-MIC-AVX512F-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Y]], i64 192
+; X64-MIC-AVX512F-NEXT:    [[TMP16:%.*]] = load i512, ptr [[TMP14]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP17:%.*]] = load i512, ptr [[TMP15]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP18:%.*]] = xor i512 [[TMP16]], [[TMP17]]
+; X64-MIC-AVX512F-NEXT:    [[TMP19:%.*]] = or i512 [[TMP3]], [[TMP8]]
+; X64-MIC-AVX512F-NEXT:    [[TMP20:%.*]] = or i512 [[TMP13]], [[TMP18]]
+; X64-MIC-AVX512F-NEXT:    [[TMP21:%.*]] = or i512 [[TMP19]], [[TMP20]]
+; X64-MIC-AVX512F-NEXT:    [[TMP22:%.*]] = icmp ne i512 [[TMP21]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP23:%.*]] = zext i1 [[TMP22]] to i32
+; X64-MIC-AVX512F-NEXT:    ret i1 [[TMP22]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 256) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length256_lt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length256_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length256_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length256_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length256_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length256_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length256_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length256_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length256_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length256_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length256_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+; Test input: a 256-byte memcmp whose result is tested with icmp slt against 0.
+; Every check prefix above expects the memcmp call to be left in place.
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 256) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length256_gt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length256_gt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length256_gt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length256_gt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length256_gt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length256_gt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length256_gt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length256_gt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length256_gt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length256_gt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length256_gt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 256) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+; Test input: a 256-byte memcmp whose result is tested with icmp sgt against 0.
+; Every check prefix above expects the memcmp call to be left in place.
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 256) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length256_eq_const(ptr %X) nounwind {
+; X64-SSE-LABEL: length256_eq_const:
+; X64-SSE:       # %bb.0:
+; X64-SSE-NEXT:    pushq %rax
+; X64-SSE-NEXT:    movl $.L.str, %esi
+; X64-SSE-NEXT:    movl $256, %edx # imm = 0x100
+; X64-SSE-NEXT:    callq memcmp
+; X64-SSE-NEXT:    testl %eax, %eax
+; X64-SSE-NEXT:    sete %al
+; X64-SSE-NEXT:    popq %rcx
+; X64-SSE-NEXT:    retq
+; NOTE(review): the X64-SSE checks above are assembly-style while every other prefix checks IR; presumably stale -- confirm no RUN line uses X64-SSE.
+;
+; X64-LABEL: define i1 @length256_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 256) #[[ATTR0]]
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length256_eq_const(
+; X64-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 256) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length256_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 256) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length256_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 256) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length256_eq_const(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 256) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length256_eq_const(
+; X64-AVX512BW-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512BW-NEXT:    [[TMP2:%.*]] = load i512, ptr @.str, align 1
+; X64-AVX512BW-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512BW-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512BW-NEXT:    [[TMP5:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-AVX512BW-NEXT:    [[TMP6:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 64), align 1
+; X64-AVX512BW-NEXT:    [[TMP7:%.*]] = xor i512 [[TMP5]], [[TMP6]]
+; X64-AVX512BW-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 128
+; X64-AVX512BW-NEXT:    [[TMP9:%.*]] = load i512, ptr [[TMP8]], align 1
+; X64-AVX512BW-NEXT:    [[TMP10:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 128), align 1
+; X64-AVX512BW-NEXT:    [[TMP11:%.*]] = xor i512 [[TMP9]], [[TMP10]]
+; X64-AVX512BW-NEXT:    [[TMP12:%.*]] = getelementptr i8, ptr [[X]], i64 192
+; X64-AVX512BW-NEXT:    [[TMP13:%.*]] = load i512, ptr [[TMP12]], align 1
+; X64-AVX512BW-NEXT:    [[TMP14:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 192), align 1
+; X64-AVX512BW-NEXT:    [[TMP15:%.*]] = xor i512 [[TMP13]], [[TMP14]]
+; X64-AVX512BW-NEXT:    [[TMP16:%.*]] = or i512 [[TMP3]], [[TMP7]]
+; X64-AVX512BW-NEXT:    [[TMP17:%.*]] = or i512 [[TMP11]], [[TMP15]]
+; X64-AVX512BW-NEXT:    [[TMP18:%.*]] = or i512 [[TMP16]], [[TMP17]]
+; X64-AVX512BW-NEXT:    [[TMP19:%.*]] = icmp ne i512 [[TMP18]], 0
+; X64-AVX512BW-NEXT:    [[TMP20:%.*]] = zext i1 [[TMP19]] to i32
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP20]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length256_eq_const(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 256) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length256_eq_const(
+; X64-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr @.str, align 1
+; X64-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-AVX512F-NEXT:    [[TMP5:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-AVX512F-NEXT:    [[TMP6:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 64), align 1
+; X64-AVX512F-NEXT:    [[TMP7:%.*]] = xor i512 [[TMP5]], [[TMP6]]
+; X64-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 128
+; X64-AVX512F-NEXT:    [[TMP9:%.*]] = load i512, ptr [[TMP8]], align 1
+; X64-AVX512F-NEXT:    [[TMP10:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 128), align 1
+; X64-AVX512F-NEXT:    [[TMP11:%.*]] = xor i512 [[TMP9]], [[TMP10]]
+; X64-AVX512F-NEXT:    [[TMP12:%.*]] = getelementptr i8, ptr [[X]], i64 192
+; X64-AVX512F-NEXT:    [[TMP13:%.*]] = load i512, ptr [[TMP12]], align 1
+; X64-AVX512F-NEXT:    [[TMP14:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 192), align 1
+; X64-AVX512F-NEXT:    [[TMP15:%.*]] = xor i512 [[TMP13]], [[TMP14]]
+; X64-AVX512F-NEXT:    [[TMP16:%.*]] = or i512 [[TMP3]], [[TMP7]]
+; X64-AVX512F-NEXT:    [[TMP17:%.*]] = or i512 [[TMP11]], [[TMP15]]
+; X64-AVX512F-NEXT:    [[TMP18:%.*]] = or i512 [[TMP16]], [[TMP17]]
+; X64-AVX512F-NEXT:    [[TMP19:%.*]] = icmp ne i512 [[TMP18]], 0
+; X64-AVX512F-NEXT:    [[TMP20:%.*]] = zext i1 [[TMP19]] to i32
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP20]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length256_eq_const(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 256) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length256_eq_const(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[TMP1:%.*]] = load i512, ptr [[X]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP2:%.*]] = load i512, ptr @.str, align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP3:%.*]] = xor i512 [[TMP1]], [[TMP2]]
+; X64-MIC-AVX512F-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 64
+; X64-MIC-AVX512F-NEXT:    [[TMP5:%.*]] = load i512, ptr [[TMP4]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP6:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 64), align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP7:%.*]] = xor i512 [[TMP5]], [[TMP6]]
+; X64-MIC-AVX512F-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 128
+; X64-MIC-AVX512F-NEXT:    [[TMP9:%.*]] = load i512, ptr [[TMP8]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP10:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 128), align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP11:%.*]] = xor i512 [[TMP9]], [[TMP10]]
+; X64-MIC-AVX512F-NEXT:    [[TMP12:%.*]] = getelementptr i8, ptr [[X]], i64 192
+; X64-MIC-AVX512F-NEXT:    [[TMP13:%.*]] = load i512, ptr [[TMP12]], align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP14:%.*]] = load i512, ptr getelementptr (i8, ptr @.str, i64 192), align 1
+; X64-MIC-AVX512F-NEXT:    [[TMP15:%.*]] = xor i512 [[TMP13]], [[TMP14]]
+; X64-MIC-AVX512F-NEXT:    [[TMP16:%.*]] = or i512 [[TMP3]], [[TMP7]]
+; X64-MIC-AVX512F-NEXT:    [[TMP17:%.*]] = or i512 [[TMP11]], [[TMP15]]
+; X64-MIC-AVX512F-NEXT:    [[TMP18:%.*]] = or i512 [[TMP16]], [[TMP17]]
+; X64-MIC-AVX512F-NEXT:    [[TMP19:%.*]] = icmp ne i512 [[TMP18]], 0
+; X64-MIC-AVX512F-NEXT:    [[TMP20:%.*]] = zext i1 [[TMP19]] to i32
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP20]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 256) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length384(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length384(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR0]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-SSE41-LABEL: define i32 @length384(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-SSE41-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @length384(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @length384(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length384(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-LABEL: define i32 @length384(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length384(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-LABEL: define i32 @length384(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX512F-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length384(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length384(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[M]]
+;
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 384) nounwind
+  ret i32 %m
+}
+
+define i1 @length384_eq(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length384_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length384_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length384_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length384_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length384_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length384_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length384_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length384_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length384_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length384_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 384) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length384_lt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length384_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length384_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length384_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length384_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length384_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length384_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length384_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length384_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length384_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length384_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 384) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length384_gt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length384_gt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length384_gt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length384_gt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length384_gt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length384_gt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length384_gt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length384_gt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length384_gt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length384_gt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length384_gt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 384) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 384) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length384_eq_const(ptr %X) nounwind {
+; X64-LABEL: define i1 @length384_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 384) #[[ATTR0]]
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length384_eq_const(
+; X64-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 384) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length384_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 384) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length384_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 384) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length384_eq_const(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 384) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length384_eq_const(
+; X64-AVX512BW-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 384) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length384_eq_const(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 384) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length384_eq_const(
+; X64-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 384) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length384_eq_const(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 384) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length384_eq_const(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 384) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 384) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length511(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length511(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR0]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-SSE41-LABEL: define i32 @length511(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-SSE41-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @length511(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @length511(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length511(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-LABEL: define i32 @length511(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length511(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-LABEL: define i32 @length511(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX512F-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length511(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length511(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[M]]
+;
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 511) nounwind
+  ret i32 %m
+}
+
+define i1 @length511_eq(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length511_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length511_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length511_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length511_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length511_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length511_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length511_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length511_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length511_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length511_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 511) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length511_lt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length511_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length511_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length511_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length511_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length511_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length511_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length511_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length511_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length511_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length511_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 511) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length511_gt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length511_gt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length511_gt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length511_gt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length511_gt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length511_gt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length511_gt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length511_gt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length511_gt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length511_gt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length511_gt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 511) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 511) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length511_eq_const(ptr %X) nounwind {
+; X64-LABEL: define i1 @length511_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 511) #[[ATTR0]]
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length511_eq_const(
+; X64-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 511) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length511_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 511) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length511_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 511) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length511_eq_const(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 511) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length511_eq_const(
+; X64-AVX512BW-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 511) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length511_eq_const(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 511) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length511_eq_const(
+; X64-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 511) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length511_eq_const(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 511) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length511_eq_const(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 511) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 511) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length512(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @length512(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR0]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-SSE41-LABEL: define i32 @length512(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-SSE41-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @length512(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @length512(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @length512(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-LABEL: define i32 @length512(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-256-LABEL: define i32 @length512(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-LABEL: define i32 @length512(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX512F-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @length512(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @length512(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[M]]
+;
+
+
+
+; 512-byte memcmp returning the raw i32 result; every check prefix in this function expects the call to remain.
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 512) nounwind
+  ret i32 %m
+}
+
+define i1 @length512_eq(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length512_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length512_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length512_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length512_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length512_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length512_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length512_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length512_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length512_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length512_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+; 512-byte memcmp whose result is compared against zero; every check prefix in this function expects the call to remain.
+; NOTE(review): despite the _eq name, the comparison below is icmp ne -- confirm this is intended.
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 512) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length512_lt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length512_lt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length512_lt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length512_lt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length512_lt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length512_lt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length512_lt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length512_lt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length512_lt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length512_lt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length512_lt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+; 512-byte memcmp tested for a negative result (slt 0); every check prefix in this function expects the call to remain.
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 512) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length512_gt(ptr %x, ptr %y) nounwind {
+; X64-LABEL: define i1 @length512_gt(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-SSE41-LABEL: define i1 @length512_gt(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-SSE41-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length512_gt(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length512_gt(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length512_gt(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512BW-LABEL: define i1 @length512_gt(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length512_gt(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX512F-LABEL: define i1 @length512_gt(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length512_gt(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[CMP]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length512_gt(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 512) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[CMP]]
+;
+
+
+
+
+; 512-byte memcmp tested for a positive result (sgt 0); every check prefix in this function expects the call to remain.
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 512) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length512_eq_const(ptr %X) nounwind {
+; X64-LABEL: define i1 @length512_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 512) #[[ATTR0]]
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @length512_eq_const(
+; X64-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 512) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length512_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 512) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length512_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 512) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @length512_eq_const(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 512) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @length512_eq_const(
+; X64-AVX512BW-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 512) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @length512_eq_const(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 512) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @length512_eq_const(
+; X64-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 512) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @length512_eq_const(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 512) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @length512_eq_const(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 512) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+
+; 512-byte equality compare of %X against the global @.str; every check prefix in this function expects the call to remain.
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 512) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; Check that a huge constant size (2^63-1 bytes) is left as a plain memcmp call and is not expanded.
+define i32 @huge_length(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i32 @huge_length(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 9223372036854775807) #[[ATTR0]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-SSE41-LABEL: define i32 @huge_length(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 9223372036854775807) #[[ATTR5]]
+; X64-SSE41-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @huge_length(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 9223372036854775807) #[[ATTR5]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @huge_length(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 9223372036854775807) #[[ATTR5]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @huge_length(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 9223372036854775807) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-LABEL: define i32 @huge_length(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 9223372036854775807) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-256-LABEL: define i32 @huge_length(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 9223372036854775807) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-LABEL: define i32 @huge_length(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 9223372036854775807) #[[ATTR5]]
+; X64-AVX512F-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @huge_length(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 9223372036854775807) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @huge_length(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 9223372036854775807) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[M]]
+;
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 9223372036854775807) nounwind
+  ret i32 %m
+}
+
+define i1 @huge_length_eq(ptr %X, ptr %Y) nounwind {
+; X64-LABEL: define i1 @huge_length_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 9223372036854775807) #[[ATTR0]]
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @huge_length_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 9223372036854775807) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @huge_length_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 9223372036854775807) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @huge_length_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 9223372036854775807) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @huge_length_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 9223372036854775807) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @huge_length_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 9223372036854775807) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @huge_length_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 9223372036854775807) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @huge_length_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 9223372036854775807) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @huge_length_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 9223372036854775807) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @huge_length_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 9223372036854775807) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+
+; Equality-only use of a 2^63-1 byte memcmp; every check prefix in this function expects the call to remain.
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 9223372036854775807) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; Check that a memcmp whose size is a runtime value (%size) is left as a call and is not expanded.
+define i32 @nonconst_length(ptr %X, ptr %Y, i64 %size) nounwind {
+; X64-LABEL: define i32 @nonconst_length(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i64 [[SIZE:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR0]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-SSE41-LABEL: define i32 @nonconst_length(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i64 [[SIZE:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR5]]
+; X64-SSE41-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @nonconst_length(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i64 [[SIZE:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR5]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @nonconst_length(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i64 [[SIZE:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR5]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-256-LABEL: define i32 @nonconst_length(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i64 [[SIZE:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512BW-LABEL: define i32 @nonconst_length(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i64 [[SIZE:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-256-LABEL: define i32 @nonconst_length(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i64 [[SIZE:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    ret i32 [[M]]
+;
+; X64-AVX512F-LABEL: define i32 @nonconst_length(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i64 [[SIZE:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR5]]
+; X64-AVX512F-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX2-LABEL: define i32 @nonconst_length(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i64 [[SIZE:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    ret i32 [[M]]
+;
+; X64-MIC-AVX512F-LABEL: define i32 @nonconst_length(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i64 [[SIZE:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    ret i32 [[M]]
+;
+
+
+
+
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 %size) nounwind
+  ret i32 %m
+}
+
+define i1 @nonconst_length_eq(ptr %X, ptr %Y, i64 %size) nounwind {
+; X64-LABEL: define i1 @nonconst_length_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i64 [[SIZE:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR0]]
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-SSE41-LABEL: define i1 @nonconst_length_eq(
+; X64-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i64 [[SIZE:%.*]]) #[[ATTR1]] {
+; X64-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR5]]
+; X64-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-SSE41-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @nonconst_length_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i64 [[SIZE:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR5]]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @nonconst_length_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i64 [[SIZE:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR5]]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-256-LABEL: define i1 @nonconst_length_eq(
+; X64-AVX512BW-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i64 [[SIZE:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR5]]
+; X64-AVX512BW-256-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512BW-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512BW-LABEL: define i1 @nonconst_length_eq(
+; X64-AVX512BW-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i64 [[SIZE:%.*]]) #[[ATTR1]] {
+; X64-AVX512BW-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR5]]
+; X64-AVX512BW-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512BW-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-256-LABEL: define i1 @nonconst_length_eq(
+; X64-AVX512F-256-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i64 [[SIZE:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-256-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR5]]
+; X64-AVX512F-256-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512F-256-NEXT:    ret i1 [[C]]
+;
+; X64-AVX512F-LABEL: define i1 @nonconst_length_eq(
+; X64-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i64 [[SIZE:%.*]]) #[[ATTR1]] {
+; X64-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR5]]
+; X64-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX512F-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX2-LABEL: define i1 @nonconst_length_eq(
+; X64-MIC-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i64 [[SIZE:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR5]]
+; X64-MIC-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-MIC-AVX2-NEXT:    ret i1 [[C]]
+;
+; X64-MIC-AVX512F-LABEL: define i1 @nonconst_length_eq(
+; X64-MIC-AVX512F-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i64 [[SIZE:%.*]]) #[[ATTR1]] {
+; X64-MIC-AVX512F-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR5]]
+; X64-MIC-AVX512F-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-MIC-AVX512F-NEXT:    ret i1 [[C]]
+;
+
+
+
+
+; Equality-only use of a memcmp with runtime size %size; every check prefix in this function expects the call to remain.
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 %size) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
diff --git a/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-nobuiltin.ll b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-nobuiltin.ll
new file mode 100644
index 00000000000000..1ad91adb9e533e
--- /dev/null
+++ b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-nobuiltin.ll
@@ -0,0 +1,248 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt -S -passes=expand-memcmp -memcmp-num-loads-per-block=1 -mtriple=x86_64-unknown-unknown         < %s | FileCheck %s --check-prefix=X64_1LD
+; RUN: opt -S -passes=expand-memcmp -memcmp-num-loads-per-block=2 -mtriple=x86_64-unknown-unknown         < %s | FileCheck %s  --check-prefix=X64_2LD
+
+
+declare signext i32 @memcmp(ptr %src1, ptr %src2, i64 %size)
+
+; Zero-length comparisons must be left as calls here: the memcmp call is nobuiltin.
+define i32 @f1(ptr %src1, ptr %src2) {
+; X64-LABEL: define i32 @f1(
+; X64-SAME: ptr [[SRC1:%.*]], ptr [[SRC2:%.*]]) {
+; X64-NEXT:    [[RES:%.*]] = call i32 @memcmp(ptr [[SRC1]], ptr [[SRC2]], i64 0) #[[ATTR0:[0-9]+]]
+; X64-NEXT:    ret i32 [[RES]]
+;
+; X64_1LD-LABEL: define i32 @f1(
+; X64_1LD-SAME: ptr [[SRC1:%.*]], ptr [[SRC2:%.*]]) {
+; X64_1LD-NEXT:    [[RES:%.*]] = call i32 @memcmp(ptr [[SRC1]], ptr [[SRC2]], i64 0) #[[ATTR0:[0-9]+]]
+; X64_1LD-NEXT:    ret i32 [[RES]]
+;
+; X64_2LD-LABEL: define i32 @f1(
+; X64_2LD-SAME: ptr [[SRC1:%.*]], ptr [[SRC2:%.*]]) {
+; X64_2LD-NEXT:    [[RES:%.*]] = call i32 @memcmp(ptr [[SRC1]], ptr [[SRC2]], i64 0) #[[ATTR0:[0-9]+]]
+; X64_2LD-NEXT:    ret i32 [[RES]]
+;
+  %res = call i32 @memcmp(ptr %src1, ptr %src2, i64 0) nobuiltin
+  ret i32 %res
+}
+
+; Check a case where the result is used as an integer.
+define i32 @f2(ptr %src1, ptr %src2) {
+; X64-LABEL: define i32 @f2(
+; X64-SAME: ptr [[SRC1:%.*]], ptr [[SRC2:%.*]]) {
+; X64-NEXT:    [[RES:%.*]] = call i32 @memcmp(ptr [[SRC1]], ptr [[SRC2]], i64 2) #[[ATTR0]]
+; X64-NEXT:    ret i32 [[RES]]
+;
+; X64_1LD-LABEL: define i32 @f2(
+; X64_1LD-SAME: ptr [[SRC1:%.*]], ptr [[SRC2:%.*]]) {
+; X64_1LD-NEXT:    [[RES:%.*]] = call i32 @memcmp(ptr [[SRC1]], ptr [[SRC2]], i64 2) #[[ATTR0]]
+; X64_1LD-NEXT:    ret i32 [[RES]]
+;
+; X64_2LD-LABEL: define i32 @f2(
+; X64_2LD-SAME: ptr [[SRC1:%.*]], ptr [[SRC2:%.*]]) {
+; X64_2LD-NEXT:    [[RES:%.*]] = call i32 @memcmp(ptr [[SRC1]], ptr [[SRC2]], i64 2) #[[ATTR0]]
+; X64_2LD-NEXT:    ret i32 [[RES]]
+;
+  %res = call i32 @memcmp(ptr %src1, ptr %src2, i64 2) nobuiltin
+  ret i32 %res
+}
+
+; Check a case where the result is tested for equality.
+define void @f3(ptr %src1, ptr %src2, ptr %dest) {
+; X64-LABEL: define void @f3(
+; X64-SAME: ptr [[SRC1:%.*]], ptr [[SRC2:%.*]], ptr [[DEST:%.*]]) {
+; X64-NEXT:    [[RES:%.*]] = call i32 @memcmp(ptr [[SRC1]], ptr [[SRC2]], i64 3) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp eq i32 [[RES]], 0
+; X64-NEXT:    br i1 [[CMP]], label [[EXIT:%.*]], label [[STORE:%.*]]
+; X64:       store:
+; X64-NEXT:    store i32 0, ptr [[DEST]], align 4
+; X64-NEXT:    br label [[EXIT]]
+; X64:       exit:
+; X64-NEXT:    ret void
+;
+; X64_1LD-LABEL: define void @f3(
+; X64_1LD-SAME: ptr [[SRC1:%.*]], ptr [[SRC2:%.*]], ptr [[DEST:%.*]]) {
+; X64_1LD-NEXT:    [[RES:%.*]] = call i32 @memcmp(ptr [[SRC1]], ptr [[SRC2]], i64 3) #[[ATTR0]]
+; X64_1LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[RES]], 0
+; X64_1LD-NEXT:    br i1 [[CMP]], label [[EXIT:%.*]], label [[STORE:%.*]]
+; X64_1LD:       store:
+; X64_1LD-NEXT:    store i32 0, ptr [[DEST]], align 4
+; X64_1LD-NEXT:    br label [[EXIT]]
+; X64_1LD:       exit:
+; X64_1LD-NEXT:    ret void
+;
+; X64_2LD-LABEL: define void @f3(
+; X64_2LD-SAME: ptr [[SRC1:%.*]], ptr [[SRC2:%.*]], ptr [[DEST:%.*]]) {
+; X64_2LD-NEXT:    [[RES:%.*]] = call i32 @memcmp(ptr [[SRC1]], ptr [[SRC2]], i64 3) #[[ATTR0]]
+; X64_2LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[RES]], 0
+; X64_2LD-NEXT:    br i1 [[CMP]], label [[EXIT:%.*]], label [[STORE:%.*]]
+; X64_2LD:       store:
+; X64_2LD-NEXT:    store i32 0, ptr [[DEST]], align 4
+; X64_2LD-NEXT:    br label [[EXIT]]
+; X64_2LD:       exit:
+; X64_2LD-NEXT:    ret void
+;
+  %res = call i32 @memcmp(ptr %src1, ptr %src2, i64 3) nobuiltin
+  %cmp = icmp eq i32 %res, 0
+  br i1 %cmp, label %exit, label %store
+
+store:
+  store i32 0, ptr %dest
+  br label %exit
+
+exit:
+  ret void
+}
+
+; Check a case where the result is tested for inequality.
+define void @f4(ptr %src1, ptr %src2, ptr %dest) {
+; X64-LABEL: define void @f4(
+; X64-SAME: ptr [[SRC1:%.*]], ptr [[SRC2:%.*]], ptr [[DEST:%.*]]) {
+; X64-NEXT:  entry:
+; X64-NEXT:    [[RES:%.*]] = call i32 @memcmp(ptr [[SRC1]], ptr [[SRC2]], i64 4) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp ne i32 [[RES]], 0
+; X64-NEXT:    br i1 [[CMP]], label [[EXIT:%.*]], label [[STORE:%.*]]
+; X64:       store:
+; X64-NEXT:    store i32 0, ptr [[DEST]], align 4
+; X64-NEXT:    br label [[EXIT]]
+; X64:       exit:
+; X64-NEXT:    ret void
+;
+; X64_1LD-LABEL: define void @f4(
+; X64_1LD-SAME: ptr [[SRC1:%.*]], ptr [[SRC2:%.*]], ptr [[DEST:%.*]]) {
+; X64_1LD-NEXT:  entry:
+; X64_1LD-NEXT:    [[RES:%.*]] = call i32 @memcmp(ptr [[SRC1]], ptr [[SRC2]], i64 4) #[[ATTR0]]
+; X64_1LD-NEXT:    [[CMP:%.*]] = icmp ne i32 [[RES]], 0
+; X64_1LD-NEXT:    br i1 [[CMP]], label [[EXIT:%.*]], label [[STORE:%.*]]
+; X64_1LD:       store:
+; X64_1LD-NEXT:    store i32 0, ptr [[DEST]], align 4
+; X64_1LD-NEXT:    br label [[EXIT]]
+; X64_1LD:       exit:
+; X64_1LD-NEXT:    ret void
+;
+; X64_2LD-LABEL: define void @f4(
+; X64_2LD-SAME: ptr [[SRC1:%.*]], ptr [[SRC2:%.*]], ptr [[DEST:%.*]]) {
+; X64_2LD-NEXT:  entry:
+; X64_2LD-NEXT:    [[RES:%.*]] = call i32 @memcmp(ptr [[SRC1]], ptr [[SRC2]], i64 4) #[[ATTR0]]
+; X64_2LD-NEXT:    [[CMP:%.*]] = icmp ne i32 [[RES]], 0
+; X64_2LD-NEXT:    br i1 [[CMP]], label [[EXIT:%.*]], label [[STORE:%.*]]
+; X64_2LD:       store:
+; X64_2LD-NEXT:    store i32 0, ptr [[DEST]], align 4
+; X64_2LD-NEXT:    br label [[EXIT]]
+; X64_2LD:       exit:
+; X64_2LD-NEXT:    ret void
+;
+entry:
+  %res = call i32 @memcmp(ptr %src1, ptr %src2, i64 4) nobuiltin
+  %cmp = icmp ne i32 %res, 0
+  br i1 %cmp, label %exit, label %store
+
+store:
+  store i32 0, ptr %dest
+  br label %exit
+
+exit:
+  ret void
+}
+
+; Check a case where the result is tested via slt.
+define void @f5(ptr %src1, ptr %src2, ptr %dest) {
+; X64-LABEL: define void @f5(
+; X64-SAME: ptr [[SRC1:%.*]], ptr [[SRC2:%.*]], ptr [[DEST:%.*]]) {
+; X64-NEXT:  entry:
+; X64-NEXT:    [[RES:%.*]] = call i32 @memcmp(ptr [[SRC1]], ptr [[SRC2]], i64 5) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp slt i32 [[RES]], 0
+; X64-NEXT:    br i1 [[CMP]], label [[EXIT:%.*]], label [[STORE:%.*]]
+; X64:       store:
+; X64-NEXT:    store i32 0, ptr [[DEST]], align 4
+; X64-NEXT:    br label [[EXIT]]
+; X64:       exit:
+; X64-NEXT:    ret void
+;
+; X64_1LD-LABEL: define void @f5(
+; X64_1LD-SAME: ptr [[SRC1:%.*]], ptr [[SRC2:%.*]], ptr [[DEST:%.*]]) {
+; X64_1LD-NEXT:  entry:
+; X64_1LD-NEXT:    [[RES:%.*]] = call i32 @memcmp(ptr [[SRC1]], ptr [[SRC2]], i64 5) #[[ATTR0]]
+; X64_1LD-NEXT:    [[CMP:%.*]] = icmp slt i32 [[RES]], 0
+; X64_1LD-NEXT:    br i1 [[CMP]], label [[EXIT:%.*]], label [[STORE:%.*]]
+; X64_1LD:       store:
+; X64_1LD-NEXT:    store i32 0, ptr [[DEST]], align 4
+; X64_1LD-NEXT:    br label [[EXIT]]
+; X64_1LD:       exit:
+; X64_1LD-NEXT:    ret void
+;
+; X64_2LD-LABEL: define void @f5(
+; X64_2LD-SAME: ptr [[SRC1:%.*]], ptr [[SRC2:%.*]], ptr [[DEST:%.*]]) {
+; X64_2LD-NEXT:  entry:
+; X64_2LD-NEXT:    [[RES:%.*]] = call i32 @memcmp(ptr [[SRC1]], ptr [[SRC2]], i64 5) #[[ATTR0]]
+; X64_2LD-NEXT:    [[CMP:%.*]] = icmp slt i32 [[RES]], 0
+; X64_2LD-NEXT:    br i1 [[CMP]], label [[EXIT:%.*]], label [[STORE:%.*]]
+; X64_2LD:       store:
+; X64_2LD-NEXT:    store i32 0, ptr [[DEST]], align 4
+; X64_2LD-NEXT:    br label [[EXIT]]
+; X64_2LD:       exit:
+; X64_2LD-NEXT:    ret void
+;
+entry:
+  %res = call i32 @memcmp(ptr %src1, ptr %src2, i64 5) nobuiltin
+  %cmp = icmp slt i32 %res, 0
+  br i1 %cmp, label %exit, label %store
+
+store:
+  store i32 0, ptr %dest
+  br label %exit
+
+exit:
+  ret void
+}
+
+; Check a case where the result is tested for sgt.
+define void @f6(ptr %src1, ptr %src2, ptr %dest) {
+; X64-LABEL: define void @f6(
+; X64-SAME: ptr [[SRC1:%.*]], ptr [[SRC2:%.*]], ptr [[DEST:%.*]]) {
+; X64-NEXT:  entry:
+; X64-NEXT:    [[RES:%.*]] = call i32 @memcmp(ptr [[SRC1]], ptr [[SRC2]], i64 6) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[RES]], 0
+; X64-NEXT:    br i1 [[CMP]], label [[EXIT:%.*]], label [[STORE:%.*]]
+; X64:       store:
+; X64-NEXT:    store i32 0, ptr [[DEST]], align 4
+; X64-NEXT:    br label [[EXIT]]
+; X64:       exit:
+; X64-NEXT:    ret void
+;
+; X64_1LD-LABEL: define void @f6(
+; X64_1LD-SAME: ptr [[SRC1:%.*]], ptr [[SRC2:%.*]], ptr [[DEST:%.*]]) {
+; X64_1LD-NEXT:  entry:
+; X64_1LD-NEXT:    [[RES:%.*]] = call i32 @memcmp(ptr [[SRC1]], ptr [[SRC2]], i64 6) #[[ATTR0]]
+; X64_1LD-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[RES]], 0
+; X64_1LD-NEXT:    br i1 [[CMP]], label [[EXIT:%.*]], label [[STORE:%.*]]
+; X64_1LD:       store:
+; X64_1LD-NEXT:    store i32 0, ptr [[DEST]], align 4
+; X64_1LD-NEXT:    br label [[EXIT]]
+; X64_1LD:       exit:
+; X64_1LD-NEXT:    ret void
+;
+; X64_2LD-LABEL: define void @f6(
+; X64_2LD-SAME: ptr [[SRC1:%.*]], ptr [[SRC2:%.*]], ptr [[DEST:%.*]]) {
+; X64_2LD-NEXT:  entry:
+; X64_2LD-NEXT:    [[RES:%.*]] = call i32 @memcmp(ptr [[SRC1]], ptr [[SRC2]], i64 6) #[[ATTR0]]
+; X64_2LD-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[RES]], 0
+; X64_2LD-NEXT:    br i1 [[CMP]], label [[EXIT:%.*]], label [[STORE:%.*]]
+; X64_2LD:       store:
+; X64_2LD-NEXT:    store i32 0, ptr [[DEST]], align 4
+; X64_2LD-NEXT:    br label [[EXIT]]
+; X64_2LD:       exit:
+; X64_2LD-NEXT:    ret void
+;
+entry:
+  %res = call i32 @memcmp(ptr %src1, ptr %src2, i64 6) nobuiltin
+  %cmp = icmp sgt i32 %res, 0
+  br i1 %cmp, label %exit, label %store
+
+store:
+  store i32 0, ptr %dest
+  br label %exit
+
+exit:
+  ret void
+}
diff --git a/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-optsize-x32.ll b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-optsize-x32.ll
new file mode 100644
index 00000000000000..b36c0db432820d
--- /dev/null
+++ b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-optsize-x32.ll
@@ -0,0 +1,870 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt -S -passes=expand-memcmp -mtriple=i686-unknown-unknown -mattr=cmov < %s | FileCheck %s --check-prefix=X86
+; RUN: opt -S -passes=expand-memcmp -mtriple=i686-unknown-unknown -mattr=+sse2 < %s | FileCheck %s --check-prefix=X86-SSE2
+
+; This tests IR-level inline expansion of memcmp by the expand-memcmp pass.
+; rdar://6480398
+
+@.str = private constant [65 x i8] c"0123456789012345678901234567890123456789012345678901234567890123\00", align 1
+
+declare dso_local i32 @memcmp(ptr, ptr, i32)
+declare dso_local i32 @bcmp(ptr, ptr, i32)
+
+define i32 @length2(ptr %X, ptr %Y) nounwind optsize {
+; X86-LABEL: define i32 @length2(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1:[0-9]+]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X86-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X86-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X86-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X86-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X86-NEXT:    ret i32 [[TMP7]]
+;
+; X86-SSE2-LABEL: define i32 @length2(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1:[0-9]+]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X86-SSE2-NEXT:    ret i32 [[TMP7]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind
+  ret i32 %m
+}
+
+define i1 @length2_eq(ptr %X, ptr %Y) nounwind optsize {
+; X86-LABEL: define i1 @length2_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X86-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length2_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length2_eq_const(ptr %X) nounwind optsize {
+; X86-LABEL: define i1 @length2_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = icmp ne i16 [[TMP1]], 12849
+; X86-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X86-NEXT:    ret i1 [[TMP2]]
+;
+; X86-SSE2-LABEL: define i1 @length2_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = icmp ne i16 [[TMP1]], 12849
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X86-SSE2-NEXT:    ret i1 [[TMP2]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([65 x i8], ptr @.str, i32 0, i32 1), i32 2) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length2_eq_nobuiltin_attr(ptr %X, ptr %Y) nounwind optsize {
+; X86-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 2) #[[ATTR3:[0-9]+]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 2) #[[ATTR3:[0-9]+]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind nobuiltin
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length3(ptr %X, ptr %Y) nounwind optsize {
+; X86-LABEL: define i32 @length3(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    br label [[LOADBB:%.*]]
+; X86:       res_block:
+; X86-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP5:%.*]], [[TMP6:%.*]]
+; X86-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86:       loadbb:
+; X86-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X]], align 1
+; X86-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP5]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X86-NEXT:    [[TMP6]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; X86-NEXT:    [[TMP7:%.*]] = icmp eq i16 [[TMP5]], [[TMP6]]
+; X86-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X86:       loadbb1:
+; X86-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X86-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X86-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X86-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X86-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X86-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X86-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X86-NEXT:    br label [[ENDBLOCK]]
+; X86:       endblock:
+; X86-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-NEXT:    ret i32 [[PHI_RES]]
+;
+; X86-SSE2-LABEL: define i32 @length3(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE2:       res_block:
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP5:%.*]], [[TMP6:%.*]]
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE2:       loadbb:
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP5]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X86-SSE2-NEXT:    [[TMP6]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = icmp eq i16 [[TMP5]], [[TMP6]]
+; X86-SSE2-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X86-SSE2:       loadbb1:
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X86-SSE2-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X86-SSE2-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X86-SSE2-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X86-SSE2-NEXT:    br label [[ENDBLOCK]]
+; X86-SSE2:       endblock:
+; X86-SSE2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE2-NEXT:    ret i32 [[PHI_RES]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 3) nounwind
+  ret i32 %m
+}
+
+define i1 @length3_eq(ptr %X, ptr %Y) nounwind optsize {
+; X86-LABEL: define i1 @length3_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; X86-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X86-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X86-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X86-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X86-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; X86-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; X86-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; X86-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; X86-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; X86-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X86-NEXT:    ret i1 [[TMP12]]
+;
+; X86-SSE2-LABEL: define i1 @length3_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; X86-SSE2-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; X86-SSE2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X86-SSE2-NEXT:    ret i1 [[TMP12]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 3) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length4(ptr %X, ptr %Y) nounwind optsize {
+; X86-LABEL: define i32 @length4(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X86-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X86-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X86-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X86-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X86-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X86-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X86-NEXT:    ret i32 [[TMP9]]
+;
+; X86-SSE2-LABEL: define i32 @length4(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X86-SSE2-NEXT:    ret i32 [[TMP9]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 4) nounwind
+  ret i32 %m
+}
+
+define i1 @length4_eq(ptr %X, ptr %Y) nounwind optsize {
+; X86-LABEL: define i1 @length4_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X86-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X86-NEXT:    ret i1 [[TMP3]]
+;
+; X86-SSE2-LABEL: define i1 @length4_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X86-SSE2-NEXT:    ret i1 [[TMP3]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 4) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length4_eq_const(ptr %X) nounwind optsize {
+; X86-LABEL: define i1 @length4_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 875770417
+; X86-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length4_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 875770417
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([65 x i8], ptr @.str, i32 0, i32 1), i32 4) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length5(ptr %X, ptr %Y) nounwind optsize {
+; X86-LABEL: define i32 @length5(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    br label [[LOADBB:%.*]]
+; X86:       res_block:
+; X86-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X86-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86:       loadbb:
+; X86-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X86:       loadbb1:
+; X86-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X86-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X86-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X86-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X86-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X86-NEXT:    br label [[ENDBLOCK]]
+; X86:       endblock:
+; X86-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-NEXT:    ret i32 [[PHI_RES]]
+;
+; X86-SSE2-LABEL: define i32 @length5(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE2:       res_block:
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE2:       loadbb:
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE2-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE2-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X86-SSE2:       loadbb1:
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X86-SSE2-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X86-SSE2-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X86-SSE2-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X86-SSE2-NEXT:    br label [[ENDBLOCK]]
+; X86-SSE2:       endblock:
+; X86-SSE2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE2-NEXT:    ret i32 [[PHI_RES]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 5) nounwind
+  ret i32 %m
+}
+
+define i1 @length5_eq(ptr %X, ptr %Y) nounwind optsize {
+; X86-LABEL: define i1 @length5_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X86-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X86-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; X86-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; X86-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X86-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X86-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X86-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X86-NEXT:    ret i1 [[TMP12]]
+;
+; X86-SSE2-LABEL: define i1 @length5_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X86-SSE2-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X86-SSE2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X86-SSE2-NEXT:    ret i1 [[TMP12]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 5) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length8(ptr %X, ptr %Y) nounwind optsize {
+; X86-LABEL: define i32 @length8(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    br label [[LOADBB:%.*]]
+; X86:       res_block:
+; X86-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X86-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X86-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86:       loadbb:
+; X86-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86:       loadbb1:
+; X86-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86:       endblock:
+; X86-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-NEXT:    ret i32 [[PHI_RES]]
+;
+; X86-SSE2-LABEL: define i32 @length8(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE2:       res_block:
+; X86-SSE2-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X86-SSE2-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE2:       loadbb:
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE2-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86-SSE2:       loadbb1:
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE2-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-SSE2-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-SSE2-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-SSE2-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86-SSE2:       endblock:
+; X86-SSE2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE2-NEXT:    ret i32 [[PHI_RES]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 8) nounwind
+  ret i32 %m
+}
+
+define i1 @length8_eq(ptr %X, ptr %Y) nounwind optsize {
+; X86-LABEL: define i1 @length8_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X86-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length8_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 8) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length8_eq_const(ptr %X) nounwind optsize {
+; X86-LABEL: define i1 @length8_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = xor i32 [[TMP1]], 858927408
+; X86-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 1
+; X86-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP4]], 926299444
+; X86-NEXT:    [[TMP6:%.*]] = or i32 [[TMP2]], [[TMP5]]
+; X86-NEXT:    [[TMP7:%.*]] = icmp ne i32 [[TMP6]], 0
+; X86-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X86-NEXT:    ret i1 [[TMP7]]
+;
+; X86-SSE2-LABEL: define i1 @length8_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = xor i32 [[TMP1]], 858927408
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 1
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP4]], 926299444
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = or i32 [[TMP2]], [[TMP5]]
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = icmp ne i32 [[TMP6]], 0
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X86-SSE2-NEXT:    ret i1 [[TMP7]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 8) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length12_eq(ptr %X, ptr %Y) nounwind optsize {
+; X86-LABEL: define i1 @length12_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 12) #[[ATTR4:[0-9]+]]
+; X86-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length12_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 12) #[[ATTR4:[0-9]+]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 12) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length12(ptr %X, ptr %Y) nounwind optsize {
+; X86-LABEL: define i32 @length12(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 12) #[[ATTR4]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length12(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 12) #[[ATTR4]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 12) nounwind
+  ret i32 %m
+}
+
+; PR33329 - https://bugs.llvm.org/show_bug.cgi?id=33329
+
+define i32 @length16(ptr %X, ptr %Y) nounwind optsize {
+; X86-LABEL: define i32 @length16(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 16) #[[ATTR4]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length16(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 16) #[[ATTR4]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 16) nounwind
+  ret i32 %m
+}
+
+define i1 @length16_eq(ptr %x, ptr %y) nounwind optsize {
+; X86-NOSSE-LABEL: length16_eq:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl $16
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    calll memcmp
+; X86-NOSSE-NEXT:    addl $12, %esp
+; X86-NOSSE-NEXT:    testl %eax, %eax
+; X86-NOSSE-NEXT:    setne %al
+; X86-NOSSE-NEXT:    retl
+;
+; X86-LABEL: define i1 @length16_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 16) #[[ATTR4]]
+; X86-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length16_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = icmp ne i128 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X86-SSE2-NEXT:    ret i1 [[TMP3]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 16) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length16_eq_const(ptr %X) nounwind optsize {
+; X86-NOSSE-LABEL: length16_eq_const:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl $16
+; X86-NOSSE-NEXT:    pushl $.L.str
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    calll memcmp
+; X86-NOSSE-NEXT:    addl $12, %esp
+; X86-NOSSE-NEXT:    testl %eax, %eax
+; X86-NOSSE-NEXT:    sete %al
+; X86-NOSSE-NEXT:    retl
+;
+; X86-LABEL: define i1 @length16_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 16) #[[ATTR4]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length16_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = icmp ne i128 [[TMP1]], 70720121592765328381466889075544961328
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 16) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; PR33914 - https://bugs.llvm.org/show_bug.cgi?id=33914
+
+define i32 @length24(ptr %X, ptr %Y) nounwind optsize {
+; X86-LABEL: define i32 @length24(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 24) #[[ATTR4]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length24(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 24) #[[ATTR4]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 24) nounwind
+  ret i32 %m
+}
+
+define i1 @length24_eq(ptr %x, ptr %y) nounwind optsize {
+; X86-NOSSE-LABEL: length24_eq:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl $24
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    calll memcmp
+; X86-NOSSE-NEXT:    addl $12, %esp
+; X86-NOSSE-NEXT:    testl %eax, %eax
+; X86-NOSSE-NEXT:    sete %al
+; X86-NOSSE-NEXT:    retl
+;
+; X86-LABEL: define i1 @length24_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 24) #[[ATTR4]]
+; X86-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length24_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 24) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length24_eq_const(ptr %X) nounwind optsize {
+; X86-NOSSE-LABEL: length24_eq_const:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl $24
+; X86-NOSSE-NEXT:    pushl $.L.str
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    calll memcmp
+; X86-NOSSE-NEXT:    addl $12, %esp
+; X86-NOSSE-NEXT:    testl %eax, %eax
+; X86-NOSSE-NEXT:    setne %al
+; X86-NOSSE-NEXT:    retl
+;
+; X86-LABEL: define i1 @length24_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 24) #[[ATTR4]]
+; X86-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length24_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 68051240286688436651889234231545575736
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X86-SSE2-NEXT:    ret i1 [[TMP7]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 24) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length32(ptr %X, ptr %Y) nounwind optsize {
+; X86-LABEL: define i32 @length32(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 32) #[[ATTR4]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length32(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 32) #[[ATTR4]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 32) nounwind
+  ret i32 %m
+}
+
+; PR33325 - https://bugs.llvm.org/show_bug.cgi?id=33325
+
+define i1 @length32_eq(ptr %x, ptr %y) nounwind optsize {
+; X86-NOSSE-LABEL: length32_eq:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl $32
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    calll memcmp
+; X86-NOSSE-NEXT:    addl $12, %esp
+; X86-NOSSE-NEXT:    testl %eax, %eax
+; X86-NOSSE-NEXT:    sete %al
+; X86-NOSSE-NEXT:    retl
+;
+; X86-LABEL: define i1 @length32_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 32) #[[ATTR4]]
+; X86-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length32_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 32) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length32_eq_const(ptr %X) nounwind optsize {
+; X86-NOSSE-LABEL: length32_eq_const:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl $32
+; X86-NOSSE-NEXT:    pushl $.L.str
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    calll memcmp
+; X86-NOSSE-NEXT:    addl $12, %esp
+; X86-NOSSE-NEXT:    testl %eax, %eax
+; X86-NOSSE-NEXT:    setne %al
+; X86-NOSSE-NEXT:    retl
+;
+; X86-LABEL: define i1 @length32_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 32) #[[ATTR4]]
+; X86-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length32_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 65382562593882267225249597816672106294
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X86-SSE2-NEXT:    ret i1 [[TMP7]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 32) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length64(ptr %X, ptr %Y) nounwind optsize {
+; X86-LABEL: define i32 @length64(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 64) #[[ATTR4]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length64(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 64) #[[ATTR4]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 64) nounwind
+  ret i32 %m
+}
+
+define i1 @length64_eq(ptr %x, ptr %y) nounwind optsize {
+; X86-LABEL: define i1 @length64_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 64) #[[ATTR4]]
+; X86-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length64_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 64) #[[ATTR4]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 64) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length64_eq_const(ptr %X) nounwind optsize {
+; X86-LABEL: define i1 @length64_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 64) #[[ATTR4]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length64_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 64) #[[ATTR4]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 64) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @bcmp_length2(ptr %X, ptr %Y) nounwind optsize {
+; X86-LABEL: define i32 @bcmp_length2(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X86-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X86-NEXT:    ret i32 [[TMP4]]
+;
+; X86-SSE2-LABEL: define i32 @bcmp_length2(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X86-SSE2-NEXT:    ret i32 [[TMP4]]
+;
+  %m = tail call i32 @bcmp(ptr %X, ptr %Y, i32 2) nounwind
+  ret i32 %m
+}
diff --git a/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-optsize.ll b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-optsize.ll
new file mode 100644
index 00000000000000..cb6c5e6da1c790
--- /dev/null
+++ b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-optsize.ll
@@ -0,0 +1,1414 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt -S -passes=expand-memcmp -mtriple=x86_64-unknown-unknown < %s | FileCheck %s --check-prefix=X64
+; RUN: opt -S -passes=expand-memcmp -mtriple=x86_64-unknown-unknown -mattr=avx < %s | FileCheck %s --check-prefix=X64-AVX1
+; RUN: opt -S -passes=expand-memcmp -mtriple=x86_64-unknown-unknown -mattr=avx2 < %s | FileCheck %s --check-prefix=X64-AVX2
+
+; This tests the IR-level inlining/optimization of memcmp done by the expand-memcmp pass
+; rdar://6480398
+
+@.str = private constant [65 x i8] c"0123456789012345678901234567890123456789012345678901234567890123\00", align 1
+
+declare dso_local i32 @memcmp(ptr, ptr, i64)
+declare dso_local i32 @bcmp(ptr, ptr, i64)
+
+define i32 @length2(ptr %X, ptr %Y) nounwind optsize {
+; X64-LABEL: define i32 @length2(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0:[0-9]+]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-NEXT:    ret i32 [[TMP7]]
+;
+; X64-AVX1-LABEL: define i32 @length2(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1:[0-9]+]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    ret i32 [[TMP7]]
+;
+; X64-AVX2-LABEL: define i32 @length2(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1:[0-9]+]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    ret i32 [[TMP7]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
+  ret i32 %m
+}
+
+define i1 @length2_eq(ptr %X, ptr %Y) nounwind optsize {
+; X64-LABEL: define i1 @length2_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length2_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length2_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length2_eq_const(ptr %X) nounwind optsize {
+; X64-LABEL: define i1 @length2_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = icmp ne i16 [[TMP1]], 12849
+; X64-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX1-LABEL: define i1 @length2_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = icmp ne i16 [[TMP1]], 12849
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX2-LABEL: define i1 @length2_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = icmp ne i16 [[TMP1]], 12849
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP2]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([65 x i8], ptr @.str, i32 0, i32 1), i64 2) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length2_eq_nobuiltin_attr(ptr %X, ptr %Y) nounwind optsize {
+; X64-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 2) #[[ATTR2:[0-9]+]]
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 2) #[[ATTR3:[0-9]+]]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 2) #[[ATTR3:[0-9]+]]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind nobuiltin
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length3(ptr %X, ptr %Y) nounwind optsize {
+; X64-LABEL: define i32 @length3(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i16 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br label [[ENDBLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX1-LABEL: define i32 @length3(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i16 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX2-LABEL: define i32 @length3(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i16 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 3) nounwind
+  ret i32 %m
+}
+
+define i1 @length3_eq(ptr %X, ptr %Y) nounwind optsize {
+; X64-LABEL: define i1 @length3_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; X64-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; X64-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; X64-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; X64-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; X64-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX1-LABEL: define i1 @length3_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX2-LABEL: define i1 @length3_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP12]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 3) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+; Three-way memcmp of 4 bytes under optsize. All three check prefixes expect the
+; call to be replaced by one i32 load per operand, a bswap of each value, and
+; ugt/ult compares whose zero-extended results are subtracted to form the i32 result.
+define i32 @length4(ptr %X, ptr %Y) nounwind optsize {
+; X64-LABEL: define i32 @length4(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-NEXT:    ret i32 [[TMP9]]
+;
+; X64-AVX1-LABEL: define i32 @length4(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-AVX1-NEXT:    ret i32 [[TMP9]]
+;
+; X64-AVX2-LABEL: define i32 @length4(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-AVX2-NEXT:    ret i32 [[TMP9]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
+  ret i32 %m
+}
+
+; memcmp of 4 bytes whose result is only tested with icmp ne against zero. All three
+; check prefixes expect one i32 load per operand and a single icmp ne; the zext of
+; that compare remains in the expected output, but the ret uses the i1 directly.
+define i1 @length4_eq(ptr %X, ptr %Y) nounwind optsize {
+; X64-LABEL: define i1 @length4_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-NEXT:    ret i1 [[TMP3]]
+;
+; X64-AVX1-LABEL: define i1 @length4_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP3]]
+;
+; X64-AVX2-LABEL: define i1 @length4_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP3]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+; Equality test (icmp eq with 0) of 4 bytes at %X against 4 bytes of @.str starting
+; at index 1. All three check prefixes expect the second operand to appear as the
+; immediate 875770417 instead of a load, with the original icmp eq kept on the
+; zero-extended compare result.
+define i1 @length4_eq_const(ptr %X) nounwind optsize {
+; X64-LABEL: define i1 @length4_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 875770417
+; X64-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length4_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 875770417
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length4_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 875770417
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([65 x i8], ptr @.str, i32 0, i32 1), i64 4) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; Three-way memcmp of 5 bytes under optsize. All three check prefixes expect a
+; two-block expansion: loadbb compares the first 4 bytes as byte-swapped i32 values
+; and branches to res_block (select of -1 or 1) on a mismatch; loadbb1 handles the
+; fifth byte by subtracting the zero-extended i8 loads; endblock merges both results
+; in a phi.
+define i32 @length5(ptr %X, ptr %Y) nounwind optsize {
+; X64-LABEL: define i32 @length5(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br label [[ENDBLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX1-LABEL: define i32 @length5(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX2-LABEL: define i32 @length5(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) nounwind
+  ret i32 %m
+}
+
+; memcmp of 5 bytes tested with icmp ne against zero. All three check prefixes
+; expect a branch-free expansion: xor of the i32 loads, xor of the zero-extended i8
+; loads at offset 4, an or of the two xors, and a single icmp ne with 0. The trailing
+; zext of that compare is not used by the ret.
+define i1 @length5_eq(ptr %X, ptr %Y) nounwind optsize {
+; X64-LABEL: define i1 @length5_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; X64-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; X64-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X64-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X64-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X64-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX1-LABEL: define i1 @length5_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX2-LABEL: define i1 @length5_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP12]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+; Three-way memcmp of 8 bytes under optsize. All three check prefixes expect one
+; i64 load per operand, llvm.bswap.i64 on each value, and ugt/ult compares whose
+; zero-extended results are subtracted to form the i32 result.
+define i32 @length8(ptr %X, ptr %Y) nounwind optsize {
+; X64-LABEL: define i32 @length8(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP1]])
+; X64-NEXT:    [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
+; X64-NEXT:    [[TMP5:%.*]] = icmp ugt i64 [[TMP3]], [[TMP4]]
+; X64-NEXT:    [[TMP6:%.*]] = icmp ult i64 [[TMP3]], [[TMP4]]
+; X64-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-NEXT:    ret i32 [[TMP9]]
+;
+; X64-AVX1-LABEL: define i32 @length8(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP1]])
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = icmp ugt i64 [[TMP3]], [[TMP4]]
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = icmp ult i64 [[TMP3]], [[TMP4]]
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-AVX1-NEXT:    ret i32 [[TMP9]]
+;
+; X64-AVX2-LABEL: define i32 @length8(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP1]])
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = icmp ugt i64 [[TMP3]], [[TMP4]]
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = icmp ult i64 [[TMP3]], [[TMP4]]
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-AVX2-NEXT:    ret i32 [[TMP9]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 8) nounwind
+  ret i32 %m
+}
+
+; memcmp of 8 bytes tested with icmp eq against zero. All three check prefixes
+; expect one i64 load per operand and an icmp ne, followed by the original icmp eq
+; applied to the zero-extended compare result.
+define i1 @length8_eq(ptr %X, ptr %Y) nounwind optsize {
+; X64-LABEL: define i1 @length8_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length8_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length8_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 8) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; memcmp of 8 bytes at %X against the start of @.str, tested with icmp ne against
+; zero. All three check prefixes expect a single i64 load compared with the
+; immediate 3978425819141910832; the ret uses that compare directly and the zext is
+; left unused.
+define i1 @length8_eq_const(ptr %X) nounwind optsize {
+; X64-LABEL: define i1 @length8_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = icmp ne i64 [[TMP1]], 3978425819141910832
+; X64-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX1-LABEL: define i1 @length8_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = icmp ne i64 [[TMP1]], 3978425819141910832
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX2-LABEL: define i1 @length8_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = icmp ne i64 [[TMP1]], 3978425819141910832
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP2]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 8) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+; memcmp of 12 bytes tested with icmp ne against zero. All three check prefixes
+; expect an i64 load pair and an i32 load pair at offset 8 (zero-extended to i64),
+; each pair xor'ed, the two xors or'ed together, and the result compared with 0.
+define i1 @length12_eq(ptr %X, ptr %Y) nounwind optsize {
+; X64-LABEL: define i1 @length12_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
+; X64-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP7]] to i64
+; X64-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX1-LABEL: define i1 @length12_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP7]] to i64
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX2-LABEL: define i1 @length12_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP7]] to i64
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP12]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 12) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+; Three-way memcmp of 12 bytes under optsize. All three check prefixes expect two
+; load blocks: loadbb compares the first 8 bytes as byte-swapped i64 values, loadbb1
+; compares the remaining 4 bytes as byte-swapped i32 values zero-extended to i64. A
+; mismatch in either block goes to res_block, which selects -1 or 1 from phis of the
+; differing values; full equality reaches endblock with 0.
+define i32 @length12(ptr %X, ptr %Y) nounwind optsize {
+; X64-LABEL: define i32 @length12(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP14:%.*]], [[LOADBB1:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP15:%.*]], [[LOADBB1]] ]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-NEXT:    [[TMP13:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-NEXT:    [[TMP14]] = zext i32 [[TMP12]] to i64
+; X64-NEXT:    [[TMP15]] = zext i32 [[TMP13]] to i64
+; X64-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[TMP14]], [[TMP15]]
+; X64-NEXT:    br i1 [[TMP16]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX1-LABEL: define i32 @length12(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP14:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX1-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP15:%.*]], [[LOADBB1]] ]
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-AVX1-NEXT:    [[TMP14]] = zext i32 [[TMP12]] to i64
+; X64-AVX1-NEXT:    [[TMP15]] = zext i32 [[TMP13]] to i64
+; X64-AVX1-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[TMP14]], [[TMP15]]
+; X64-AVX1-NEXT:    br i1 [[TMP16]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX2-LABEL: define i32 @length12(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP14:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP15:%.*]], [[LOADBB1]] ]
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-AVX2-NEXT:    [[TMP14]] = zext i32 [[TMP12]] to i64
+; X64-AVX2-NEXT:    [[TMP15]] = zext i32 [[TMP13]] to i64
+; X64-AVX2-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[TMP14]], [[TMP15]]
+; X64-AVX2-NEXT:    br i1 [[TMP16]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 12) nounwind
+  ret i32 %m
+}
+
+; PR33329 - https://bugs.llvm.org/show_bug.cgi?id=33329
+
+; Three-way memcmp of 16 bytes under optsize. All three check prefixes expect two
+; i64 load blocks (offsets 0 and 8), each byte-swapped and compared for equality; a
+; mismatch branches to res_block, which selects -1 or 1 from phis of the differing
+; values, and full equality reaches endblock with 0.
+define i32 @length16(ptr %X, ptr %Y) nounwind optsize {
+; X64-LABEL: define i32 @length16(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX1-LABEL: define i32 @length16(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX1-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX1-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX2-LABEL: define i32 @length16(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 16) nounwind
+  ret i32 %m
+}
+
+define i1 @length16_eq(ptr %x, ptr %y) nounwind optsize {
+; X64-SSE2-LABEL: length16_eq:
+; X64-SSE2:       # %bb.0:
+; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
+; X64-SSE2-NEXT:    movdqu (%rsi), %xmm1
+; X64-SSE2-NEXT:    pcmpeqb %xmm0, %xmm1
+; X64-SSE2-NEXT:    pmovmskb %xmm1, %eax
+; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
+; X64-SSE2-NEXT:    setne %al
+; X64-SSE2-NEXT:    retq
+;
+; X64-AVX-LABEL: length16_eq:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-AVX-NEXT:    vpxor (%rsi), %xmm0, %xmm0
+; X64-AVX-NEXT:    vptest %xmm0, %xmm0
+; X64-AVX-NEXT:    setne %al
+; X64-AVX-NEXT:    retq
+; X64-LABEL: define i1 @length16_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = icmp ne i128 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-NEXT:    ret i1 [[TMP3]]
+;
+; X64-AVX1-LABEL: define i1 @length16_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = icmp ne i128 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP3]]
+;
+; X64-AVX2-LABEL: define i1 @length16_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = icmp ne i128 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP3]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 16) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length16_eq_const(ptr %X) nounwind optsize {
+; X64-SSE2-LABEL: length16_eq_const:
+; X64-SSE2:       # %bb.0:
+; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
+; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-SSE2-NEXT:    pmovmskb %xmm0, %eax
+; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
+; X64-SSE2-NEXT:    sete %al
+; X64-SSE2-NEXT:    retq
+;
+; X64-AVX-LABEL: length16_eq_const:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-AVX-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT:    vptest %xmm0, %xmm0
+; X64-AVX-NEXT:    sete %al
+; X64-AVX-NEXT:    retq
+; X64-LABEL: define i1 @length16_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = icmp ne i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length16_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = icmp ne i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length16_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = icmp ne i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 16) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; PR33914 - https://bugs.llvm.org/show_bug.cgi?id=33914
+
+define i32 @length24(ptr %X, ptr %Y) nounwind optsize {
+; X64-LABEL: define i32 @length24(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 24) #[[ATTR3:[0-9]+]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @length24(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 24) #[[ATTR4:[0-9]+]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @length24(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 24) #[[ATTR4:[0-9]+]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 24) nounwind
+  ret i32 %m
+}
+
+define i1 @length24_eq(ptr %x, ptr %y) nounwind optsize {
+; X64-SSE2-LABEL: length24_eq:
+; X64-SSE2:       # %bb.0:
+; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
+; X64-SSE2-NEXT:    movdqu (%rsi), %xmm1
+; X64-SSE2-NEXT:    pcmpeqb %xmm0, %xmm1
+; X64-SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X64-SSE2-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
+; X64-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
+; X64-SSE2-NEXT:    pand %xmm1, %xmm2
+; X64-SSE2-NEXT:    pmovmskb %xmm2, %eax
+; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
+; X64-SSE2-NEXT:    sete %al
+; X64-SSE2-NEXT:    retq
+;
+; X64-AVX-LABEL: length24_eq:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; X64-AVX-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
+; X64-AVX-NEXT:    vpxor %xmm2, %xmm1, %xmm1
+; X64-AVX-NEXT:    vpxor (%rsi), %xmm0, %xmm0
+; X64-AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; X64-AVX-NEXT:    vptest %xmm0, %xmm0
+; X64-AVX-NEXT:    sete %al
+; X64-AVX-NEXT:    retq
+; X64-LABEL: define i1 @length24_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = zext i64 [[TMP6]] to i128
+; X64-NEXT:    [[TMP9:%.*]] = zext i64 [[TMP7]] to i128
+; X64-NEXT:    [[TMP10:%.*]] = xor i128 [[TMP8]], [[TMP9]]
+; X64-NEXT:    [[TMP11:%.*]] = or i128 [[TMP3]], [[TMP10]]
+; X64-NEXT:    [[TMP12:%.*]] = icmp ne i128 [[TMP11]], 0
+; X64-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length24_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = zext i64 [[TMP6]] to i128
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = zext i64 [[TMP7]] to i128
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = xor i128 [[TMP8]], [[TMP9]]
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = or i128 [[TMP3]], [[TMP10]]
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = icmp ne i128 [[TMP11]], 0
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length24_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = zext i64 [[TMP6]] to i128
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = zext i64 [[TMP7]] to i128
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = xor i128 [[TMP8]], [[TMP9]]
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = or i128 [[TMP3]], [[TMP10]]
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = icmp ne i128 [[TMP11]], 0
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 24) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length24_eq_const(ptr %X) nounwind optsize {
+; X64-SSE2-LABEL: length24_eq_const:
+; X64-SSE2:       # %bb.0:
+; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
+; X64-SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-SSE2-NEXT:    pand %xmm1, %xmm0
+; X64-SSE2-NEXT:    pmovmskb %xmm0, %eax
+; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
+; X64-SSE2-NEXT:    setne %al
+; X64-SSE2-NEXT:    retq
+;
+; X64-AVX-LABEL: length24_eq_const:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; X64-AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; X64-AVX-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; X64-AVX-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vptest %xmm0, %xmm0
+; X64-AVX-NEXT:    setne %al
+; X64-AVX-NEXT:    retq
+; X64-LABEL: define i1 @length24_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 1
+; X64-NEXT:    [[TMP5:%.*]] = zext i64 [[TMP4]] to i128
+; X64-NEXT:    [[TMP6:%.*]] = xor i128 [[TMP5]], 3689065127958034230
+; X64-NEXT:    [[TMP7:%.*]] = or i128 [[TMP2]], [[TMP6]]
+; X64-NEXT:    [[TMP8:%.*]] = icmp ne i128 [[TMP7]], 0
+; X64-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; X64-NEXT:    ret i1 [[TMP8]]
+;
+; X64-AVX1-LABEL: define i1 @length24_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 1
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = zext i64 [[TMP4]] to i128
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = xor i128 [[TMP5]], 3689065127958034230
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = or i128 [[TMP2]], [[TMP6]]
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = icmp ne i128 [[TMP7]], 0
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP8]]
+;
+; X64-AVX2-LABEL: define i1 @length24_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 1
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = zext i64 [[TMP4]] to i128
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = xor i128 [[TMP5]], 3689065127958034230
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = or i128 [[TMP2]], [[TMP6]]
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = icmp ne i128 [[TMP7]], 0
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP8]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 24) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length32(ptr %X, ptr %Y) nounwind optsize {
+; X64-LABEL: define i32 @length32(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 32) #[[ATTR3]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @length32(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 32) #[[ATTR4]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @length32(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 32) #[[ATTR4]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 32) nounwind
+  ret i32 %m
+}
+
+; PR33325 - https://bugs.llvm.org/show_bug.cgi?id=33325
+
+define i1 @length32_eq(ptr %x, ptr %y) nounwind optsize {
+; X64-SSE2-LABEL: length32_eq:
+; X64-SSE2:       # %bb.0:
+; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
+; X64-SSE2-NEXT:    movdqu 16(%rdi), %xmm1
+; X64-SSE2-NEXT:    movdqu (%rsi), %xmm2
+; X64-SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
+; X64-SSE2-NEXT:    movdqu 16(%rsi), %xmm0
+; X64-SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
+; X64-SSE2-NEXT:    pand %xmm2, %xmm0
+; X64-SSE2-NEXT:    pmovmskb %xmm0, %eax
+; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
+; X64-SSE2-NEXT:    sete %al
+; X64-SSE2-NEXT:    retq
+;
+; X64-LABEL: define i1 @length32_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length32_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = icmp ne i256 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length32_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = icmp ne i256 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 32) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length32_eq_const(ptr %X) nounwind optsize {
+; X64-SSE2-LABEL: length32_eq_const:
+; X64-SSE2:       # %bb.0:
+; X64-SSE2-NEXT:    movdqu (%rdi), %xmm0
+; X64-SSE2-NEXT:    movdqu 16(%rdi), %xmm1
+; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; X64-SSE2-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-SSE2-NEXT:    pand %xmm1, %xmm0
+; X64-SSE2-NEXT:    pmovmskb %xmm0, %eax
+; X64-SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
+; X64-SSE2-NEXT:    setne %al
+; X64-SSE2-NEXT:    retq
+;
+; X64-LABEL: define i1 @length32_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 65382562593882267225249597816672106294
+; X64-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X64-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; X64-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-NEXT:    ret i1 [[TMP7]]
+;
+; X64-AVX1-LABEL: define i1 @length32_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = icmp ne i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX2-LABEL: define i1 @length32_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = icmp ne i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP2]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 32) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length64(ptr %X, ptr %Y) nounwind optsize {
+; X64-LABEL: define i32 @length64(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR3]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @length64(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR4]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @length64(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR4]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 64) nounwind
+  ret i32 %m
+}
+
+define i1 @length64_eq(ptr %x, ptr %y) nounwind optsize {
+; X64-SSE2-LABEL: length64_eq:
+; X64-SSE2:       # %bb.0:
+; X64-SSE2-NEXT:    pushq %rax
+; X64-SSE2-NEXT:    movl $64, %edx
+; X64-SSE2-NEXT:    callq memcmp
+; X64-SSE2-NEXT:    testl %eax, %eax
+; X64-SSE2-NEXT:    setne %al
+; X64-SSE2-NEXT:    popq %rcx
+; X64-SSE2-NEXT:    retq
+;
+; X64-LABEL: define i1 @length64_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR3]]
+; X64-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length64_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = icmp ne i256 [[TMP9]], 0
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP10]]
+;
+; X64-AVX2-LABEL: define i1 @length64_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = icmp ne i256 [[TMP9]], 0
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP10]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 64) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length64_eq_const(ptr %X) nounwind optsize {
+; X64-SSE2-LABEL: length64_eq_const:
+; X64-SSE2:       # %bb.0:
+; X64-SSE2-NEXT:    pushq %rax
+; X64-SSE2-NEXT:    movl $.L.str, %esi
+; X64-SSE2-NEXT:    movl $64, %edx
+; X64-SSE2-NEXT:    callq memcmp
+; X64-SSE2-NEXT:    testl %eax, %eax
+; X64-SSE2-NEXT:    sete %al
+; X64-SSE2-NEXT:    popq %rcx
+; X64-SSE2-NEXT:    retq
+;
+; X64-LABEL: define i1 @length64_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 64) #[[ATTR3]]
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length64_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp ne i256 [[TMP6]], 0
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP8]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length64_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp ne i256 [[TMP6]], 0
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP8]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 64) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @bcmp_length2(ptr %X, ptr %Y) nounwind optsize {
+; X64-LABEL: define i32 @bcmp_length2(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-NEXT:    ret i32 [[TMP4]]
+;
+; X64-AVX1-LABEL: define i32 @bcmp_length2(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX1-NEXT:    ret i32 [[TMP4]]
+;
+; X64-AVX2-LABEL: define i32 @bcmp_length2(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX2-NEXT:    ret i32 [[TMP4]]
+;
+  %m = tail call i32 @bcmp(ptr %X, ptr %Y, i64 2) nounwind
+  ret i32 %m
+}
diff --git a/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-pgso-x32.ll b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-pgso-x32.ll
new file mode 100644
index 00000000000000..a8b054cd20e270
--- /dev/null
+++ b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-pgso-x32.ll
@@ -0,0 +1,887 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt -S -passes=expand-memcmp -mtriple=i686-unknown-unknown -mattr=cmov < %s | FileCheck %s --check-prefix=X86
+; RUN: opt -S -passes=expand-memcmp -mtriple=i686-unknown-unknown -mattr=+sse2 < %s | FileCheck %s  --check-prefix=X86-SSE2
+
+; This tests the IR-level inlining/optimization of memcmp done by the expand-memcmp pass
+; rdar://6480398
+
+@.str = private constant [65 x i8] c"0123456789012345678901234567890123456789012345678901234567890123\00", align 1
+
+declare dso_local i32 @memcmp(ptr, ptr, i32)
+declare dso_local i32 @bcmp(ptr, ptr, i32)
+
+define i32 @length2(ptr %X, ptr %Y) nounwind !prof !14 {
+; X86-LABEL: define i32 @length2(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1:[0-9]+]] !prof [[PROF14:![0-9]+]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X86-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X86-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X86-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X86-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X86-NEXT:    ret i32 [[TMP7]]
+;
+; X86-SSE2-LABEL: define i32 @length2(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1:[0-9]+]] !prof [[PROF14:![0-9]+]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X86-SSE2-NEXT:    ret i32 [[TMP7]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind
+  ret i32 %m
+}
+
+define i1 @length2_eq(ptr %X, ptr %Y) nounwind !prof !14 {
+; X86-LABEL: define i1 @length2_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X86-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length2_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length2_eq_const(ptr %X) nounwind !prof !14 {
+; X86-LABEL: define i1 @length2_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = icmp ne i16 [[TMP1]], 12849
+; X86-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X86-NEXT:    ret i1 [[TMP2]]
+;
+; X86-SSE2-LABEL: define i1 @length2_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = icmp ne i16 [[TMP1]], 12849
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X86-SSE2-NEXT:    ret i1 [[TMP2]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([65 x i8], ptr @.str, i32 0, i32 1), i32 2) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length2_eq_nobuiltin_attr(ptr %X, ptr %Y) nounwind !prof !14 {
+; X86-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 2) #[[ATTR3:[0-9]+]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 2) #[[ATTR3:[0-9]+]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind nobuiltin
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length3(ptr %X, ptr %Y) nounwind !prof !14 {
+; X86-LABEL: define i32 @length3(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-NEXT:    br label [[LOADBB:%.*]]
+; X86:       res_block:
+; X86-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP5:%.*]], [[TMP6:%.*]]
+; X86-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86:       loadbb:
+; X86-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X]], align 1
+; X86-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP5]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X86-NEXT:    [[TMP6]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; X86-NEXT:    [[TMP7:%.*]] = icmp eq i16 [[TMP5]], [[TMP6]]
+; X86-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X86:       loadbb1:
+; X86-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X86-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X86-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X86-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X86-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X86-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X86-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X86-NEXT:    br label [[ENDBLOCK]]
+; X86:       endblock:
+; X86-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-NEXT:    ret i32 [[PHI_RES]]
+;
+; X86-SSE2-LABEL: define i32 @length3(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-SSE2-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE2:       res_block:
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP5:%.*]], [[TMP6:%.*]]
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE2:       loadbb:
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP5]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X86-SSE2-NEXT:    [[TMP6]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = icmp eq i16 [[TMP5]], [[TMP6]]
+; X86-SSE2-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X86-SSE2:       loadbb1:
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X86-SSE2-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X86-SSE2-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X86-SSE2-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X86-SSE2-NEXT:    br label [[ENDBLOCK]]
+; X86-SSE2:       endblock:
+; X86-SSE2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE2-NEXT:    ret i32 [[PHI_RES]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 3) nounwind
+  ret i32 %m
+}
+
+define i1 @length3_eq(ptr %X, ptr %Y) nounwind !prof !14 {
+; X86-LABEL: define i1 @length3_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; X86-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X86-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X86-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X86-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X86-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; X86-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; X86-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; X86-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; X86-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; X86-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X86-NEXT:    ret i1 [[TMP12]]
+;
+; X86-SSE2-LABEL: define i1 @length3_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; X86-SSE2-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; X86-SSE2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X86-SSE2-NEXT:    ret i1 [[TMP12]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 3) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length4(ptr %X, ptr %Y) nounwind !prof !14 {
+; X86-LABEL: define i32 @length4(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X86-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X86-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X86-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X86-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X86-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X86-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X86-NEXT:    ret i32 [[TMP9]]
+;
+; X86-SSE2-LABEL: define i32 @length4(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X86-SSE2-NEXT:    ret i32 [[TMP9]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 4) nounwind
+  ret i32 %m
+}
+
+define i1 @length4_eq(ptr %X, ptr %Y) nounwind !prof !14 {
+; X86-LABEL: define i1 @length4_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X86-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X86-NEXT:    ret i1 [[TMP3]]
+;
+; X86-SSE2-LABEL: define i1 @length4_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X86-SSE2-NEXT:    ret i1 [[TMP3]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 4) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length4_eq_const(ptr %X) nounwind !prof !14 {
+; X86-LABEL: define i1 @length4_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 875770417
+; X86-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length4_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 875770417
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([65 x i8], ptr @.str, i32 0, i32 1), i32 4) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length5(ptr %X, ptr %Y) nounwind !prof !14 {
+; X86-LABEL: define i32 @length5(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-NEXT:    br label [[LOADBB:%.*]]
+; X86:       res_block:
+; X86-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X86-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86:       loadbb:
+; X86-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X86:       loadbb1:
+; X86-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X86-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X86-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X86-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X86-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X86-NEXT:    br label [[ENDBLOCK]]
+; X86:       endblock:
+; X86-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-NEXT:    ret i32 [[PHI_RES]]
+;
+; X86-SSE2-LABEL: define i32 @length5(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-SSE2-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE2:       res_block:
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE2:       loadbb:
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE2-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE2-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X86-SSE2:       loadbb1:
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X86-SSE2-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X86-SSE2-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X86-SSE2-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X86-SSE2-NEXT:    br label [[ENDBLOCK]]
+; X86-SSE2:       endblock:
+; X86-SSE2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE2-NEXT:    ret i32 [[PHI_RES]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 5) nounwind
+  ret i32 %m
+}
+
+define i1 @length5_eq(ptr %X, ptr %Y) nounwind !prof !14 {
+; X86-LABEL: define i1 @length5_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X86-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X86-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; X86-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; X86-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X86-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X86-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X86-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X86-NEXT:    ret i1 [[TMP12]]
+;
+; X86-SSE2-LABEL: define i1 @length5_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X86-SSE2-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X86-SSE2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X86-SSE2-NEXT:    ret i1 [[TMP12]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 5) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length8(ptr %X, ptr %Y) nounwind !prof !14 {
+; X86-LABEL: define i32 @length8(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-NEXT:    br label [[LOADBB:%.*]]
+; X86:       res_block:
+; X86-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X86-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X86-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86:       loadbb:
+; X86-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86:       loadbb1:
+; X86-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86:       endblock:
+; X86-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-NEXT:    ret i32 [[PHI_RES]]
+;
+; X86-SSE2-LABEL: define i32 @length8(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-SSE2-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE2:       res_block:
+; X86-SSE2-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X86-SSE2-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE2:       loadbb:
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE2-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86-SSE2:       loadbb1:
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE2-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-SSE2-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-SSE2-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-SSE2-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86-SSE2:       endblock:
+; X86-SSE2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE2-NEXT:    ret i32 [[PHI_RES]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 8) nounwind
+  ret i32 %m
+}
+
+define i1 @length8_eq(ptr %X, ptr %Y) nounwind !prof !14 {
+; X86-LABEL: define i1 @length8_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X86-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length8_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 8) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length8_eq_const(ptr %X) nounwind !prof !14 {
+; X86-LABEL: define i1 @length8_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = xor i32 [[TMP1]], 858927408
+; X86-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 1
+; X86-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP4]], 926299444
+; X86-NEXT:    [[TMP6:%.*]] = or i32 [[TMP2]], [[TMP5]]
+; X86-NEXT:    [[TMP7:%.*]] = icmp ne i32 [[TMP6]], 0
+; X86-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X86-NEXT:    ret i1 [[TMP7]]
+;
+; X86-SSE2-LABEL: define i1 @length8_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = xor i32 [[TMP1]], 858927408
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 1
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP4]], 926299444
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = or i32 [[TMP2]], [[TMP5]]
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = icmp ne i32 [[TMP6]], 0
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X86-SSE2-NEXT:    ret i1 [[TMP7]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 8) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length12_eq(ptr %X, ptr %Y) nounwind !prof !14 {
+; X86-LABEL: define i1 @length12_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 12) #[[ATTR4:[0-9]+]]
+; X86-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length12_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 12) #[[ATTR4:[0-9]+]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 12) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length12(ptr %X, ptr %Y) nounwind !prof !14 {
+; X86-LABEL: define i32 @length12(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 12) #[[ATTR4]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length12(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 12) #[[ATTR4]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 12) nounwind
+  ret i32 %m
+}
+
+; PR33329 - https://bugs.llvm.org/show_bug.cgi?id=33329
+
+define i32 @length16(ptr %X, ptr %Y) nounwind !prof !14 {
+; X86-LABEL: define i32 @length16(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 16) #[[ATTR4]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length16(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 16) #[[ATTR4]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 16) nounwind
+  ret i32 %m
+}
+
+define i1 @length16_eq(ptr %x, ptr %y) nounwind !prof !14 {
+; X86-NOSSE-LABEL: length16_eq:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl $16
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    calll memcmp
+; X86-NOSSE-NEXT:    addl $12, %esp
+; X86-NOSSE-NEXT:    testl %eax, %eax
+; X86-NOSSE-NEXT:    setne %al
+; X86-NOSSE-NEXT:    retl
+;
+; X86-LABEL: define i1 @length16_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 16) #[[ATTR4]]
+; X86-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length16_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = icmp ne i128 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X86-SSE2-NEXT:    ret i1 [[TMP3]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 16) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length16_eq_const(ptr %X) nounwind !prof !14 {
+; X86-NOSSE-LABEL: length16_eq_const:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl $16
+; X86-NOSSE-NEXT:    pushl $.L.str
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    calll memcmp
+; X86-NOSSE-NEXT:    addl $12, %esp
+; X86-NOSSE-NEXT:    testl %eax, %eax
+; X86-NOSSE-NEXT:    sete %al
+; X86-NOSSE-NEXT:    retl
+;
+; X86-LABEL: define i1 @length16_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 16) #[[ATTR4]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length16_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = icmp ne i128 [[TMP1]], 70720121592765328381466889075544961328
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 16) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; PR33914 - https://bugs.llvm.org/show_bug.cgi?id=33914
+
+define i32 @length24(ptr %X, ptr %Y) nounwind !prof !14 {
+; X86-LABEL: define i32 @length24(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 24) #[[ATTR4]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length24(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 24) #[[ATTR4]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 24) nounwind
+  ret i32 %m
+}
+
+define i1 @length24_eq(ptr %x, ptr %y) nounwind !prof !14 {
+; X86-NOSSE-LABEL: length24_eq:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl $24
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    calll memcmp
+; X86-NOSSE-NEXT:    addl $12, %esp
+; X86-NOSSE-NEXT:    testl %eax, %eax
+; X86-NOSSE-NEXT:    sete %al
+; X86-NOSSE-NEXT:    retl
+;
+; X86-LABEL: define i1 @length24_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 24) #[[ATTR4]]
+; X86-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length24_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 24) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length24_eq_const(ptr %X) nounwind !prof !14 {
+; X86-NOSSE-LABEL: length24_eq_const:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl $24
+; X86-NOSSE-NEXT:    pushl $.L.str
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    calll memcmp
+; X86-NOSSE-NEXT:    addl $12, %esp
+; X86-NOSSE-NEXT:    testl %eax, %eax
+; X86-NOSSE-NEXT:    setne %al
+; X86-NOSSE-NEXT:    retl
+;
+; X86-LABEL: define i1 @length24_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 24) #[[ATTR4]]
+; X86-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length24_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 68051240286688436651889234231545575736
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X86-SSE2-NEXT:    ret i1 [[TMP7]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 24) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length32(ptr %X, ptr %Y) nounwind !prof !14 {
+; X86-LABEL: define i32 @length32(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 32) #[[ATTR4]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length32(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 32) #[[ATTR4]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 32) nounwind
+  ret i32 %m
+}
+
+; PR33325 - https://bugs.llvm.org/show_bug.cgi?id=33325
+
+define i1 @length32_eq(ptr %x, ptr %y) nounwind !prof !14 {
+; X86-NOSSE-LABEL: length32_eq:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl $32
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    calll memcmp
+; X86-NOSSE-NEXT:    addl $12, %esp
+; X86-NOSSE-NEXT:    testl %eax, %eax
+; X86-NOSSE-NEXT:    sete %al
+; X86-NOSSE-NEXT:    retl
+;
+; X86-LABEL: define i1 @length32_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 32) #[[ATTR4]]
+; X86-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length32_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 32) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length32_eq_const(ptr %X) nounwind !prof !14 {
+; X86-NOSSE-LABEL: length32_eq_const:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl $32
+; X86-NOSSE-NEXT:    pushl $.L.str
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    calll memcmp
+; X86-NOSSE-NEXT:    addl $12, %esp
+; X86-NOSSE-NEXT:    testl %eax, %eax
+; X86-NOSSE-NEXT:    setne %al
+; X86-NOSSE-NEXT:    retl
+;
+; X86-LABEL: define i1 @length32_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 32) #[[ATTR4]]
+; X86-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length32_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 65382562593882267225249597816672106294
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X86-SSE2-NEXT:    ret i1 [[TMP7]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 32) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length64(ptr %X, ptr %Y) nounwind !prof !14 {
+; X86-LABEL: define i32 @length64(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 64) #[[ATTR4]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length64(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 64) #[[ATTR4]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 64) nounwind
+  ret i32 %m
+}
+
+define i1 @length64_eq(ptr %x, ptr %y) nounwind !prof !14 {
+; X86-LABEL: define i1 @length64_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 64) #[[ATTR4]]
+; X86-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length64_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 64) #[[ATTR4]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 64) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length64_eq_const(ptr %X) nounwind !prof !14 {
+; X86-LABEL: define i1 @length64_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 64) #[[ATTR4]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length64_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 64) #[[ATTR4]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 64) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @bcmp_length2(ptr %X, ptr %Y) nounwind !prof !14 {
+; X86-LABEL: define i32 @bcmp_length2(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X86-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X86-NEXT:    ret i32 [[TMP4]]
+;
+; X86-SSE2-LABEL: define i32 @bcmp_length2(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X86-SSE2-NEXT:    ret i32 [[TMP4]]
+;
+  %m = tail call i32 @bcmp(ptr %X, ptr %Y, i32 2) nounwind
+  ret i32 %m
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i32 10000}
+!4 = !{!"MaxCount", i32 10}
+!5 = !{!"MaxInternalCount", i32 1}
+!6 = !{!"MaxFunctionCount", i32 1000}
+!7 = !{!"NumCounts", i32 3}
+!8 = !{!"NumFunctions", i32 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i32 100, i32 1}
+!12 = !{i32 999000, i32 100, i32 1}
+!13 = !{i32 999999, i32 1, i32 2}
+!14 = !{!"function_entry_count", i32 0}
diff --git a/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-pgso.ll b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-pgso.ll
new file mode 100644
index 00000000000000..1507cbdc4e86ec
--- /dev/null
+++ b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-pgso.ll
@@ -0,0 +1,1347 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt -S -passes=expand-memcmp -mtriple=x86_64-unknown-unknown < %s | FileCheck %s --check-prefix=X64
+; RUN: opt -S -passes=expand-memcmp -mtriple=x86_64-unknown-unknown -mattr=avx < %s | FileCheck %s --check-prefix=X64-AVX1
+; RUN: opt -S -passes=expand-memcmp -mtriple=x86_64-unknown-unknown -mattr=avx2 < %s | FileCheck %s --check-prefix=X64-AVX2
+
+; This tests inlining/optimization of memcmp by the expand-memcmp IR pass (run via opt)
+; rdar://6480398
+
+@.str = private constant [65 x i8] c"0123456789012345678901234567890123456789012345678901234567890123\00", align 1
+
+declare dso_local i32 @memcmp(ptr, ptr, i64)
+declare dso_local i32 @bcmp(ptr, ptr, i64)
+
+define i32 @length2(ptr %X, ptr %Y) nounwind !prof !14 {
+; X64-LABEL: define i32 @length2(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0:[0-9]+]] !prof [[PROF14:![0-9]+]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-NEXT:    ret i32 [[TMP7]]
+;
+; X64-AVX1-LABEL: define i32 @length2(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1:[0-9]+]] !prof [[PROF14:![0-9]+]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    ret i32 [[TMP7]]
+;
+; X64-AVX2-LABEL: define i32 @length2(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1:[0-9]+]] !prof [[PROF14:![0-9]+]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    ret i32 [[TMP7]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
+  ret i32 %m
+}
+
+define i1 @length2_eq(ptr %X, ptr %Y) nounwind !prof !14 {
+; X64-LABEL: define i1 @length2_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] !prof [[PROF14]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length2_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length2_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length2_eq_const(ptr %X) nounwind !prof !14 {
+; X64-LABEL: define i1 @length2_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] !prof [[PROF14]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = icmp ne i16 [[TMP1]], 12849
+; X64-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX1-LABEL: define i1 @length2_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = icmp ne i16 [[TMP1]], 12849
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX2-LABEL: define i1 @length2_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = icmp ne i16 [[TMP1]], 12849
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP2]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([65 x i8], ptr @.str, i32 0, i32 1), i64 2) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length2_eq_nobuiltin_attr(ptr %X, ptr %Y) nounwind !prof !14 {
+; X64-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] !prof [[PROF14]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 2) #[[ATTR2:[0-9]+]]
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 2) #[[ATTR3:[0-9]+]]
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 2) #[[ATTR3:[0-9]+]]
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) nounwind nobuiltin
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length3(ptr %X, ptr %Y) nounwind !prof !14 {
+; X64-LABEL: define i32 @length3(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] !prof [[PROF14]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i16 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br label [[ENDBLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX1-LABEL: define i32 @length3(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i16 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX2-LABEL: define i32 @length3(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i16 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 3) nounwind
+  ret i32 %m
+}
+
+define i1 @length3_eq(ptr %X, ptr %Y) nounwind !prof !14 {
+; X64-LABEL: define i1 @length3_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] !prof [[PROF14]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; X64-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; X64-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; X64-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; X64-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; X64-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX1-LABEL: define i1 @length3_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX2-LABEL: define i1 @length3_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP12]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 3) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length4(ptr %X, ptr %Y) nounwind !prof !14 {
+; X64-LABEL: define i32 @length4(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] !prof [[PROF14]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-NEXT:    ret i32 [[TMP9]]
+;
+; X64-AVX1-LABEL: define i32 @length4(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-AVX1-NEXT:    ret i32 [[TMP9]]
+;
+; X64-AVX2-LABEL: define i32 @length4(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-AVX2-NEXT:    ret i32 [[TMP9]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
+  ret i32 %m
+}
+
+define i1 @length4_eq(ptr %X, ptr %Y) nounwind !prof !14 {
+; X64-LABEL: define i1 @length4_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] !prof [[PROF14]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-NEXT:    ret i1 [[TMP3]]
+;
+; X64-AVX1-LABEL: define i1 @length4_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP3]]
+;
+; X64-AVX2-LABEL: define i1 @length4_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP3]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length4_eq_const(ptr %X) nounwind !prof !14 {
+; X64-LABEL: define i1 @length4_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] !prof [[PROF14]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 875770417
+; X64-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length4_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 875770417
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length4_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 875770417
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([65 x i8], ptr @.str, i32 0, i32 1), i64 4) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length5(ptr %X, ptr %Y) nounwind !prof !14 {
+; X64-LABEL: define i32 @length5(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] !prof [[PROF14]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br label [[ENDBLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX1-LABEL: define i32 @length5(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX2-LABEL: define i32 @length5(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    br label [[ENDBLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) nounwind
+  ret i32 %m
+}
+
+define i1 @length5_eq(ptr %X, ptr %Y) nounwind !prof !14 {
+; X64-LABEL: define i1 @length5_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] !prof [[PROF14]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; X64-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; X64-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X64-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X64-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X64-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX1-LABEL: define i1 @length5_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX2-LABEL: define i1 @length5_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP12]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length8(ptr %X, ptr %Y) nounwind !prof !14 {
+; X64-LABEL: define i32 @length8(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] !prof [[PROF14]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP1]])
+; X64-NEXT:    [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
+; X64-NEXT:    [[TMP5:%.*]] = icmp ugt i64 [[TMP3]], [[TMP4]]
+; X64-NEXT:    [[TMP6:%.*]] = icmp ult i64 [[TMP3]], [[TMP4]]
+; X64-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-NEXT:    ret i32 [[TMP9]]
+;
+; X64-AVX1-LABEL: define i32 @length8(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP1]])
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = icmp ugt i64 [[TMP3]], [[TMP4]]
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = icmp ult i64 [[TMP3]], [[TMP4]]
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-AVX1-NEXT:    ret i32 [[TMP9]]
+;
+; X64-AVX2-LABEL: define i32 @length8(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP1]])
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = icmp ugt i64 [[TMP3]], [[TMP4]]
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = icmp ult i64 [[TMP3]], [[TMP4]]
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-AVX2-NEXT:    ret i32 [[TMP9]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 8) nounwind
+  ret i32 %m
+}
+
+define i1 @length8_eq(ptr %X, ptr %Y) nounwind !prof !14 {
+; X64-LABEL: define i1 @length8_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] !prof [[PROF14]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length8_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length8_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 8) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; memcmp of 8 bytes against the global @.str, with the result tested for
+; inequality with zero. Every run is expected to replace the call with a
+; single i64 load compared against an immediate.
+define i1 @length8_eq_const(ptr %X) nounwind !prof !14 {
+; X64-LABEL: define i1 @length8_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] !prof [[PROF14]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = icmp ne i64 [[TMP1]], 3978425819141910832
+; X64-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX1-LABEL: define i1 @length8_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = icmp ne i64 [[TMP1]], 3978425819141910832
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX2-LABEL: define i1 @length8_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = icmp ne i64 [[TMP1]], 3978425819141910832
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP2]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 8) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+; memcmp of 12 bytes with the result tested for inequality with zero. Every
+; run is expected to expand the call into an i64 load pair plus an i32 load
+; pair at offset 8 (zero-extended to i64), xor each pair, or the results and
+; compare against zero.
+define i1 @length12_eq(ptr %X, ptr %Y) nounwind !prof !14 {
+; X64-LABEL: define i1 @length12_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] !prof [[PROF14]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
+; X64-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP7]] to i64
+; X64-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX1-LABEL: define i1 @length12_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP7]] to i64
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP12]]
+;
+; X64-AVX2-LABEL: define i1 @length12_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP7]] to i64
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP12]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 12) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+; memcmp of 12 bytes whose i32 result is returned directly. Every run is
+; expected to expand the call into two load blocks: loadbb compares the first
+; 8 bytes as byte-swapped i64 values, loadbb1 compares the remaining 4 bytes
+; as byte-swapped i32 values zero-extended to i64. On a mismatch, res_block
+; selects -1 or 1 with an unsigned less-than; endblock merges that with 0.
+define i32 @length12(ptr %X, ptr %Y) nounwind !prof !14 {
+; X64-LABEL: define i32 @length12(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] !prof [[PROF14]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP14:%.*]], [[LOADBB1:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP15:%.*]], [[LOADBB1]] ]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-NEXT:    [[TMP13:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-NEXT:    [[TMP14]] = zext i32 [[TMP12]] to i64
+; X64-NEXT:    [[TMP15]] = zext i32 [[TMP13]] to i64
+; X64-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[TMP14]], [[TMP15]]
+; X64-NEXT:    br i1 [[TMP16]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX1-LABEL: define i32 @length12(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP14:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX1-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP15:%.*]], [[LOADBB1]] ]
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-AVX1-NEXT:    [[TMP14]] = zext i32 [[TMP12]] to i64
+; X64-AVX1-NEXT:    [[TMP15]] = zext i32 [[TMP13]] to i64
+; X64-AVX1-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[TMP14]], [[TMP15]]
+; X64-AVX1-NEXT:    br i1 [[TMP16]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX2-LABEL: define i32 @length12(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP14:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP15:%.*]], [[LOADBB1]] ]
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-AVX2-NEXT:    [[TMP14]] = zext i32 [[TMP12]] to i64
+; X64-AVX2-NEXT:    [[TMP15]] = zext i32 [[TMP13]] to i64
+; X64-AVX2-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[TMP14]], [[TMP15]]
+; X64-AVX2-NEXT:    br i1 [[TMP16]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 12) nounwind
+  ret i32 %m
+}
+
+; PR33329 - https://bugs.llvm.org/show_bug.cgi?id=33329
+
+; memcmp of 16 bytes whose i32 result is returned directly. Every run is
+; expected to expand the call into two load blocks, each comparing 8 bytes as
+; byte-swapped i64 values (offsets 0 and 8). On a mismatch, res_block selects
+; -1 or 1 with an unsigned less-than; endblock merges that with 0.
+define i32 @length16(ptr %X, ptr %Y) nounwind !prof !14 {
+; X64-LABEL: define i32 @length16(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] !prof [[PROF14]] {
+; X64-NEXT:    br label [[LOADBB:%.*]]
+; X64:       res_block:
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64:       loadbb:
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64:       loadbb1:
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64:       endblock:
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX1-LABEL: define i32 @length16(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX1-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX1:       res_block:
+; X64-AVX1-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX1-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX1:       loadbb:
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX1-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX1:       loadbb1:
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX1-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX1-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX1-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX1-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX1:       endblock:
+; X64-AVX1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX1-NEXT:    ret i32 [[PHI_RES]]
+;
+; X64-AVX2-LABEL: define i32 @length16(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX2-NEXT:    br label [[LOADBB:%.*]]
+; X64-AVX2:       res_block:
+; X64-AVX2-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-AVX2-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X64-AVX2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X64-AVX2:       loadbb:
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-AVX2-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-AVX2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-AVX2:       loadbb1:
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-AVX2-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-AVX2-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-AVX2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-AVX2-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-AVX2:       endblock:
+; X64-AVX2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-AVX2-NEXT:    ret i32 [[PHI_RES]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 16) nounwind
+  ret i32 %m
+}
+
+; memcmp of 16 bytes with the result tested for inequality with zero. Every
+; run is expected to replace the call with a single i128 load pair and one
+; icmp ne.
+define i1 @length16_eq(ptr %x, ptr %y) nounwind !prof !14 {
+;
+; X64-LABEL: define i1 @length16_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] !prof [[PROF14]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = icmp ne i128 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-NEXT:    ret i1 [[TMP3]]
+;
+; X64-AVX1-LABEL: define i1 @length16_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = icmp ne i128 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP3]]
+;
+; X64-AVX2-LABEL: define i1 @length16_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = icmp ne i128 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP3]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 16) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+; memcmp of 16 bytes against the global @.str, with the result tested for
+; equality with zero. Every run is expected to replace the call with a single
+; i128 load compared against an immediate.
+define i1 @length16_eq_const(ptr %X) nounwind !prof !14 {
+;
+; X64-LABEL: define i1 @length16_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] !prof [[PROF14]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = icmp ne i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length16_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = icmp ne i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length16_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = icmp ne i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 16) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; PR33914 - https://bugs.llvm.org/show_bug.cgi?id=33914
+
+; memcmp of 24 bytes whose i32 result is returned directly. Every run is
+; expected to leave the memcmp call in place (no inline expansion).
+define i32 @length24(ptr %X, ptr %Y) nounwind !prof !14 {
+; X64-LABEL: define i32 @length24(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] !prof [[PROF14]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 24) #[[ATTR0]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @length24(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 24) #[[ATTR4:[0-9]+]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @length24(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 24) #[[ATTR4:[0-9]+]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 24) nounwind
+  ret i32 %m
+}
+
+; memcmp of 24 bytes with the result tested for equality with zero. Every run
+; is expected to expand the call into an i128 load pair plus an i64 load pair
+; at offset 16 (zero-extended to i128), xor each pair, or the results and
+; compare against zero.
+define i1 @length24_eq(ptr %x, ptr %y) nounwind !prof !14 {
+;
+; X64-LABEL: define i1 @length24_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] !prof [[PROF14]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = zext i64 [[TMP6]] to i128
+; X64-NEXT:    [[TMP9:%.*]] = zext i64 [[TMP7]] to i128
+; X64-NEXT:    [[TMP10:%.*]] = xor i128 [[TMP8]], [[TMP9]]
+; X64-NEXT:    [[TMP11:%.*]] = or i128 [[TMP3]], [[TMP10]]
+; X64-NEXT:    [[TMP12:%.*]] = icmp ne i128 [[TMP11]], 0
+; X64-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length24_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = zext i64 [[TMP6]] to i128
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = zext i64 [[TMP7]] to i128
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = xor i128 [[TMP8]], [[TMP9]]
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = or i128 [[TMP3]], [[TMP10]]
+; X64-AVX1-NEXT:    [[TMP12:%.*]] = icmp ne i128 [[TMP11]], 0
+; X64-AVX1-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length24_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = zext i64 [[TMP6]] to i128
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = zext i64 [[TMP7]] to i128
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = xor i128 [[TMP8]], [[TMP9]]
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = or i128 [[TMP3]], [[TMP10]]
+; X64-AVX2-NEXT:    [[TMP12:%.*]] = icmp ne i128 [[TMP11]], 0
+; X64-AVX2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 24) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+; memcmp of 24 bytes against the global @.str, with the result tested for
+; inequality with zero. Every run is expected to expand the call into an i128
+; load and an i64 load at offset 16 (zero-extended to i128), each xor'ed with
+; an immediate, or'ed together and compared against zero.
+define i1 @length24_eq_const(ptr %X) nounwind !prof !14 {
+;
+; X64-LABEL: define i1 @length24_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] !prof [[PROF14]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 1
+; X64-NEXT:    [[TMP5:%.*]] = zext i64 [[TMP4]] to i128
+; X64-NEXT:    [[TMP6:%.*]] = xor i128 [[TMP5]], 3689065127958034230
+; X64-NEXT:    [[TMP7:%.*]] = or i128 [[TMP2]], [[TMP6]]
+; X64-NEXT:    [[TMP8:%.*]] = icmp ne i128 [[TMP7]], 0
+; X64-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; X64-NEXT:    ret i1 [[TMP8]]
+;
+; X64-AVX1-LABEL: define i1 @length24_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 1
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = zext i64 [[TMP4]] to i128
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = xor i128 [[TMP5]], 3689065127958034230
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = or i128 [[TMP2]], [[TMP6]]
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = icmp ne i128 [[TMP7]], 0
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP8]]
+;
+; X64-AVX2-LABEL: define i1 @length24_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 1
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = zext i64 [[TMP4]] to i128
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = xor i128 [[TMP5]], 3689065127958034230
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = or i128 [[TMP2]], [[TMP6]]
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = icmp ne i128 [[TMP7]], 0
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP8]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 24) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+; memcmp of 32 bytes whose i32 result is returned directly. Every run is
+; expected to leave the memcmp call in place (no inline expansion).
+define i32 @length32(ptr %X, ptr %Y) nounwind !prof !14 {
+; X64-LABEL: define i32 @length32(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] !prof [[PROF14]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 32) #[[ATTR0]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @length32(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 32) #[[ATTR4]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @length32(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 32) #[[ATTR4]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 32) nounwind
+  ret i32 %m
+}
+
+; PR33325 - https://bugs.llvm.org/show_bug.cgi?id=33325
+
+; memcmp of 32 bytes with the result tested for equality with zero. The plain
+; X64 run is expected to use two i128 load pairs (offsets 0 and 16) combined
+; with xor/or; the AVX1 and AVX2 runs are expected to use a single i256 load
+; pair and one icmp ne.
+define i1 @length32_eq(ptr %x, ptr %y) nounwind !prof !14 {
+;
+; X64-LABEL: define i1 @length32_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] !prof [[PROF14]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X64-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X64-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X64-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X64-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X64-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X64-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length32_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = icmp ne i256 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX1-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX1-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX2-LABEL: define i1 @length32_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = icmp ne i256 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP4]], 0
+; X64-AVX2-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 32) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+; memcmp of 32 bytes against the global @.str, with the result tested for
+; inequality with zero. The plain X64 run is expected to use two i128 loads
+; xor'ed with immediates and or'ed together; the AVX1 and AVX2 runs are
+; expected to use a single i256 load compared against one immediate.
+define i1 @length32_eq_const(ptr %X) nounwind !prof !14 {
+;
+; X64-LABEL: define i1 @length32_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] !prof [[PROF14]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X64-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X64-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X64-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 65382562593882267225249597816672106294
+; X64-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X64-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; X64-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-NEXT:    ret i1 [[TMP7]]
+;
+; X64-AVX1-LABEL: define i1 @length32_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = icmp ne i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP2]]
+;
+; X64-AVX2-LABEL: define i1 @length32_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = icmp ne i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP2]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 32) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+; memcmp of 64 bytes whose i32 result is returned directly. Every run is
+; expected to leave the memcmp call in place (no inline expansion).
+define i32 @length64(ptr %X, ptr %Y) nounwind !prof !14 {
+; X64-LABEL: define i32 @length64(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] !prof [[PROF14]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR0]]
+; X64-NEXT:    ret i32 [[M]]
+;
+; X64-AVX1-LABEL: define i32 @length64(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR4]]
+; X64-AVX1-NEXT:    ret i32 [[M]]
+;
+; X64-AVX2-LABEL: define i32 @length64(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR4]]
+; X64-AVX2-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 64) nounwind
+  ret i32 %m
+}
+
+; memcmp of 64 bytes with the result tested for inequality with zero. The
+; plain X64 run is expected to keep the memcmp call; the AVX1 and AVX2 runs
+; are expected to expand it into two i256 load pairs (offsets 0 and 32)
+; combined with xor/or and compared against zero.
+define i1 @length64_eq(ptr %x, ptr %y) nounwind !prof !14 {
+;
+; X64-LABEL: define i1 @length64_eq(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] !prof [[PROF14]] {
+; X64-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 64) #[[ATTR0]]
+; X64-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X64-NEXT:    ret i1 [[CMP]]
+;
+; X64-AVX1-LABEL: define i1 @length64_eq(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX1-NEXT:    [[TMP9:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX1-NEXT:    [[TMP10:%.*]] = icmp ne i256 [[TMP9]], 0
+; X64-AVX1-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX1-NEXT:    ret i1 [[TMP10]]
+;
+; X64-AVX2-LABEL: define i1 @length64_eq(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i256, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = xor i256 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 32
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = load i256, ptr [[TMP4]], align 1
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = load i256, ptr [[TMP5]], align 1
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = xor i256 [[TMP6]], [[TMP7]]
+; X64-AVX2-NEXT:    [[TMP9:%.*]] = or i256 [[TMP3]], [[TMP8]]
+; X64-AVX2-NEXT:    [[TMP10:%.*]] = icmp ne i256 [[TMP9]], 0
+; X64-AVX2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64-AVX2-NEXT:    ret i1 [[TMP10]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 64) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length64_eq_const(ptr %X) nounwind !prof !14 {
+;
+; X64-LABEL: define i1 @length64_eq_const(
+; X64-SAME: ptr [[X:%.*]]) #[[ATTR0]] !prof [[PROF14]] {
+; X64-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i64 64) #[[ATTR0]]
+; X64-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X64-NEXT:    ret i1 [[C]]
+;
+; X64-AVX1-LABEL: define i1 @length64_eq_const(
+; X64-AVX1-SAME: ptr [[X:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX1-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-AVX1-NEXT:    [[TMP6:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX1-NEXT:    [[TMP7:%.*]] = icmp ne i256 [[TMP6]], 0
+; X64-AVX1-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-AVX1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP8]], 0
+; X64-AVX1-NEXT:    ret i1 [[C]]
+;
+; X64-AVX2-LABEL: define i1 @length64_eq_const(
+; X64-AVX2-SAME: ptr [[X:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i256, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = xor i256 [[TMP1]], 22248533154802671749360035741805466271990224543450513484713781259640245465392
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 32
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = load i256, ptr [[TMP3]], align 1
+; X64-AVX2-NEXT:    [[TMP5:%.*]] = xor i256 [[TMP4]], 23156637116659864195145731957391441738757757709540232586892941433547502400306
+; X64-AVX2-NEXT:    [[TMP6:%.*]] = or i256 [[TMP2]], [[TMP5]]
+; X64-AVX2-NEXT:    [[TMP7:%.*]] = icmp ne i256 [[TMP6]], 0
+; X64-AVX2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X64-AVX2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP8]], 0
+; X64-AVX2-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 64) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @bcmp_length2(ptr %X, ptr %Y) nounwind !prof !14 {
+; X64-LABEL: define i32 @bcmp_length2(
+; X64-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR0]] !prof [[PROF14]] {
+; X64-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-NEXT:    ret i32 [[TMP4]]
+;
+; X64-AVX1-LABEL: define i32 @bcmp_length2(
+; X64-AVX1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX1-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX1-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX1-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X64-AVX1-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX1-NEXT:    ret i32 [[TMP4]]
+;
+; X64-AVX2-LABEL: define i32 @bcmp_length2(
+; X64-AVX2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] !prof [[PROF14]] {
+; X64-AVX2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-AVX2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-AVX2-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X64-AVX2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-AVX2-NEXT:    ret i32 [[TMP4]]
+;
+  %m = tail call i32 @bcmp(ptr %X, ptr %Y, i64 2) nounwind
+  ret i32 %m
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 10000}
+!4 = !{!"MaxCount", i64 10}
+!5 = !{!"MaxInternalCount", i64 1}
+!6 = !{!"MaxFunctionCount", i64 1000}
+!7 = !{!"NumCounts", i64 3}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13}
+!11 = !{i32 10000, i64 100, i32 1}
+!12 = !{i32 999000, i64 100, i32 1}
+!13 = !{i32 999999, i64 1, i32 2}
+!14 = !{!"function_entry_count", i64 0}
diff --git a/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-x32-2.ll b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-x32-2.ll
new file mode 100644
index 00000000000000..8c86c110c7bb2b
--- /dev/null
+++ b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-x32-2.ll
@@ -0,0 +1,4813 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt -S -passes=expand-memcmp -mtriple=i686-unknown-unknown -mattr=cmov < %s | FileCheck %s --check-prefix=X86
+; RUN: opt -S -passes=expand-memcmp -mtriple=i686-unknown-unknown -mattr=+sse < %s | FileCheck %s --check-prefix=X86-SSE1
+; RUN: opt -S -passes=expand-memcmp -mtriple=i686-unknown-unknown -mattr=+sse2 < %s | FileCheck %s --check-prefix=X86-SSE2
+; RUN: opt -S -passes=expand-memcmp -mtriple=i686-unknown-unknown -mattr=+sse4.1 < %s | FileCheck %s --check-prefix=X86-SSE41
+
+; This tests the IR-level inline expansion/optimization of memcmp by the expand-memcmp pass (run through opt)
+; rdar://6480398
+
+ at .str = private constant [513 x i8] c"01234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901\00", align 1
+
+declare dso_local i32 @memcmp(ptr, ptr, i32)
+
+define i32 @length0(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length0(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1:[0-9]+]] {
+; X86-NEXT:    ret i32 0
+;
+; X86-SSE1-LABEL: define i32 @length0(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1:[0-9]+]] {
+; X86-SSE1-NEXT:    ret i32 0
+;
+; X86-SSE2-LABEL: define i32 @length0(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1:[0-9]+]] {
+; X86-SSE2-NEXT:    ret i32 0
+;
+; X86-SSE41-LABEL: define i32 @length0(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1:[0-9]+]] {
+; X86-SSE41-NEXT:    ret i32 0
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 0) nounwind
+  ret i32 %m
+  }
+
+define i1 @length0_eq(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length0_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    ret i1 true
+;
+; X86-SSE1-LABEL: define i1 @length0_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    ret i1 true
+;
+; X86-SSE2-LABEL: define i1 @length0_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    ret i1 true
+;
+; X86-SSE41-LABEL: define i1 @length0_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    ret i1 true
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 0) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length0_lt(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length0_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    ret i1 false
+;
+; X86-SSE1-LABEL: define i1 @length0_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    ret i1 false
+;
+; X86-SSE2-LABEL: define i1 @length0_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    ret i1 false
+;
+; X86-SSE41-LABEL: define i1 @length0_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    ret i1 false
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 0) nounwind
+  %c = icmp slt i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length2(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length2(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X86-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X86-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X86-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X86-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X86-NEXT:    ret i32 [[TMP7]]
+;
+; X86-SSE1-LABEL: define i32 @length2(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X86-SSE1-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X86-SSE1-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X86-SSE1-NEXT:    ret i32 [[TMP7]]
+;
+; X86-SSE2-LABEL: define i32 @length2(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X86-SSE2-NEXT:    ret i32 [[TMP7]]
+;
+; X86-SSE41-LABEL: define i32 @length2(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X86-SSE41-NEXT:    ret i32 [[TMP7]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind
+  ret i32 %m
+}
+
+define i32 @length2_const(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length2_const(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X86-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP2]] to i32
+; X86-NEXT:    [[TMP4:%.*]] = sub i32 [[TMP3]], 12594
+; X86-NEXT:    ret i32 [[TMP4]]
+;
+; X86-SSE1-LABEL: define i32 @length2_const(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP2]] to i32
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = sub i32 [[TMP3]], 12594
+; X86-SSE1-NEXT:    ret i32 [[TMP4]]
+;
+; X86-SSE2-LABEL: define i32 @length2_const(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP2]] to i32
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = sub i32 [[TMP3]], 12594
+; X86-SSE2-NEXT:    ret i32 [[TMP4]]
+;
+; X86-SSE41-LABEL: define i32 @length2_const(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP2]] to i32
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = sub i32 [[TMP3]], 12594
+; X86-SSE41-NEXT:    ret i32 [[TMP4]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i32 2) nounwind
+  ret i32 %m
+}
+
+define i1 @length2_gt_const(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length2_gt_const(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X86-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP2]] to i32
+; X86-NEXT:    [[TMP4:%.*]] = sub i32 [[TMP3]], 12594
+; X86-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP4]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length2_gt_const(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP2]] to i32
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = sub i32 [[TMP3]], 12594
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP4]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length2_gt_const(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP2]] to i32
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = sub i32 [[TMP3]], 12594
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP4]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length2_gt_const(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP2]] to i32
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = sub i32 [[TMP3]], 12594
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP4]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i32 2) nounwind
+  %c = icmp sgt i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length2_eq(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length2_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X86-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length2_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length2_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length2_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length2_lt(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length2_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X86-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X86-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X86-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X86-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X86-NEXT:    [[C:%.*]] = icmp slt i32 [[TMP7]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length2_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X86-SSE1-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X86-SSE1-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp slt i32 [[TMP7]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length2_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp slt i32 [[TMP7]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length2_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp slt i32 [[TMP7]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind
+  %c = icmp slt i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length2_gt(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length2_gt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X86-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X86-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X86-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X86-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X86-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP7]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length2_gt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X86-SSE1-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X86-SSE1-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP7]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length2_gt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP7]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length2_gt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP7]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind
+  %c = icmp sgt i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length2_eq_const(ptr %X) nounwind {
+; X86-LABEL: define i1 @length2_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = icmp ne i16 [[TMP1]], 12849
+; X86-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X86-NEXT:    ret i1 [[TMP2]]
+;
+; X86-SSE1-LABEL: define i1 @length2_eq_const(
+; X86-SSE1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = icmp ne i16 [[TMP1]], 12849
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X86-SSE1-NEXT:    ret i1 [[TMP2]]
+;
+; X86-SSE2-LABEL: define i1 @length2_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = icmp ne i16 [[TMP1]], 12849
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X86-SSE2-NEXT:    ret i1 [[TMP2]]
+;
+; X86-SSE41-LABEL: define i1 @length2_eq_const(
+; X86-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = icmp ne i16 [[TMP1]], 12849
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X86-SSE41-NEXT:    ret i1 [[TMP2]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i32 2) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length2_eq_nobuiltin_attr(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 2) #[[ATTR4:[0-9]+]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 2) #[[ATTR4:[0-9]+]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 2) #[[ATTR4:[0-9]+]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 2) #[[ATTR4:[0-9]+]]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 2) nounwind nobuiltin
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length3(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length3(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    br label [[LOADBB:%.*]]
+; X86:       res_block:
+; X86-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP5:%.*]], [[TMP6:%.*]]
+; X86-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86:       loadbb:
+; X86-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X]], align 1
+; X86-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP5]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X86-NEXT:    [[TMP6]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; X86-NEXT:    [[TMP7:%.*]] = icmp eq i16 [[TMP5]], [[TMP6]]
+; X86-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X86:       loadbb1:
+; X86-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X86-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X86-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X86-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X86-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X86-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X86-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X86-NEXT:    br label [[ENDBLOCK]]
+; X86:       endblock:
+; X86-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-NEXT:    ret i32 [[PHI_RES]]
+;
+; X86-SSE1-LABEL: define i32 @length3(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE1:       res_block:
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP5:%.*]], [[TMP6:%.*]]
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE1:       loadbb:
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP5]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X86-SSE1-NEXT:    [[TMP6]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = icmp eq i16 [[TMP5]], [[TMP6]]
+; X86-SSE1-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X86-SSE1:       loadbb1:
+; X86-SSE1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X86-SSE1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X86-SSE1-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X86-SSE1-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X86-SSE1-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X86-SSE1-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X86-SSE1-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X86-SSE1-NEXT:    br label [[ENDBLOCK]]
+; X86-SSE1:       endblock:
+; X86-SSE1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE1-NEXT:    ret i32 [[PHI_RES]]
+;
+; X86-SSE2-LABEL: define i32 @length3(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE2:       res_block:
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP5:%.*]], [[TMP6:%.*]]
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE2:       loadbb:
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP5]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X86-SSE2-NEXT:    [[TMP6]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = icmp eq i16 [[TMP5]], [[TMP6]]
+; X86-SSE2-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X86-SSE2:       loadbb1:
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X86-SSE2-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X86-SSE2-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X86-SSE2-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X86-SSE2-NEXT:    br label [[ENDBLOCK]]
+; X86-SSE2:       endblock:
+; X86-SSE2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X86-SSE41-LABEL: define i32 @length3(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE41:       res_block:
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP5:%.*]], [[TMP6:%.*]]
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE41:       loadbb:
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP5]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X86-SSE41-NEXT:    [[TMP6]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i16 [[TMP5]], [[TMP6]]
+; X86-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X86-SSE41:       loadbb1:
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X86-SSE41-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X86-SSE41-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X86-SSE41-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X86-SSE41-NEXT:    br label [[ENDBLOCK]]
+; X86-SSE41:       endblock:
+; X86-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE41-NEXT:    ret i32 [[PHI_RES]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 3) nounwind
+  ret i32 %m
+}
+
+define i1 @length3_eq(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length3_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; X86-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X86-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X86-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X86-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X86-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; X86-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; X86-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; X86-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; X86-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; X86-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X86-NEXT:    ret i1 [[TMP12]]
+;
+; X86-SSE1-LABEL: define i1 @length3_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X86-SSE1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X86-SSE1-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X86-SSE1-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; X86-SSE1-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; X86-SSE1-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; X86-SSE1-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; X86-SSE1-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; X86-SSE1-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X86-SSE1-NEXT:    ret i1 [[TMP12]]
+;
+; X86-SSE2-LABEL: define i1 @length3_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; X86-SSE2-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; X86-SSE2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X86-SSE2-NEXT:    ret i1 [[TMP12]]
+;
+; X86-SSE41-LABEL: define i1 @length3_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; X86-SSE41-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; X86-SSE41-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X86-SSE41-NEXT:    ret i1 [[TMP12]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 3) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length4(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length4(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X86-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X86-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X86-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X86-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X86-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X86-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X86-NEXT:    ret i32 [[TMP9]]
+;
+; X86-SSE1-LABEL: define i32 @length4(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X86-SSE1-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X86-SSE1-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X86-SSE1-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X86-SSE1-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X86-SSE1-NEXT:    ret i32 [[TMP9]]
+;
+; X86-SSE2-LABEL: define i32 @length4(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X86-SSE2-NEXT:    ret i32 [[TMP9]]
+;
+; X86-SSE41-LABEL: define i32 @length4(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X86-SSE41-NEXT:    ret i32 [[TMP9]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 4) nounwind
+  ret i32 %m
+}
+
+define i1 @length4_eq(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length4_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X86-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X86-NEXT:    ret i1 [[TMP3]]
+;
+; X86-SSE1-LABEL: define i1 @length4_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X86-SSE1-NEXT:    ret i1 [[TMP3]]
+;
+; X86-SSE2-LABEL: define i1 @length4_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X86-SSE2-NEXT:    ret i1 [[TMP3]]
+;
+; X86-SSE41-LABEL: define i1 @length4_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X86-SSE41-NEXT:    ret i1 [[TMP3]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 4) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length4_lt(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length4_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X86-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X86-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X86-NEXT:    ret i1 [[TMP5]]
+;
+; X86-SSE1-LABEL: define i1 @length4_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X86-SSE1-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X86-SSE1-NEXT:    ret i1 [[TMP5]]
+;
+; X86-SSE2-LABEL: define i1 @length4_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X86-SSE2-NEXT:    ret i1 [[TMP5]]
+;
+; X86-SSE41-LABEL: define i1 @length4_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X86-SSE41-NEXT:    ret i1 [[TMP5]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 4) nounwind
+  %c = icmp slt i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length4_gt(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length4_gt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X86-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X86-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X86-NEXT:    ret i1 [[TMP5]]
+;
+; X86-SSE1-LABEL: define i1 @length4_gt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X86-SSE1-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X86-SSE1-NEXT:    ret i1 [[TMP5]]
+;
+; X86-SSE2-LABEL: define i1 @length4_gt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X86-SSE2-NEXT:    ret i1 [[TMP5]]
+;
+; X86-SSE41-LABEL: define i1 @length4_gt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X86-SSE41-NEXT:    ret i1 [[TMP5]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 4) nounwind
+  %c = icmp sgt i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length4_eq_const(ptr %X) nounwind {
+; X86-LABEL: define i1 @length4_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 875770417
+; X86-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length4_eq_const(
+; X86-SSE1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 875770417
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length4_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 875770417
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length4_eq_const(
+; X86-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 875770417
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i32 4) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length5(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length5(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    br label [[LOADBB:%.*]]
+; X86:       res_block:
+; X86-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X86-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86:       loadbb:
+; X86-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X86:       loadbb1:
+; X86-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X86-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X86-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X86-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X86-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X86-NEXT:    br label [[ENDBLOCK]]
+; X86:       endblock:
+; X86-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-NEXT:    ret i32 [[PHI_RES]]
+;
+; X86-SSE1-LABEL: define i32 @length5(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE1:       res_block:
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE1:       loadbb:
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE1-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE1-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X86-SSE1:       loadbb1:
+; X86-SSE1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE1-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X86-SSE1-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X86-SSE1-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X86-SSE1-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X86-SSE1-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X86-SSE1-NEXT:    br label [[ENDBLOCK]]
+; X86-SSE1:       endblock:
+; X86-SSE1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE1-NEXT:    ret i32 [[PHI_RES]]
+;
+; X86-SSE2-LABEL: define i32 @length5(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE2:       res_block:
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE2:       loadbb:
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE2-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE2-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X86-SSE2:       loadbb1:
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X86-SSE2-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X86-SSE2-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X86-SSE2-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X86-SSE2-NEXT:    br label [[ENDBLOCK]]
+; X86-SSE2:       endblock:
+; X86-SSE2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X86-SSE41-LABEL: define i32 @length5(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE41:       res_block:
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE41:       loadbb:
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE41-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X86-SSE41:       loadbb1:
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X86-SSE41-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X86-SSE41-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X86-SSE41-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X86-SSE41-NEXT:    br label [[ENDBLOCK]]
+; X86-SSE41:       endblock:
+; X86-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE41-NEXT:    ret i32 [[PHI_RES]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 5) nounwind
+  ret i32 %m
+}
+
+define i1 @length5_eq(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length5_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X86-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X86-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; X86-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; X86-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X86-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X86-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X86-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X86-NEXT:    ret i1 [[TMP12]]
+;
+; X86-SSE1-LABEL: define i1 @length5_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE1-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X86-SSE1-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; X86-SSE1-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; X86-SSE1-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X86-SSE1-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X86-SSE1-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X86-SSE1-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X86-SSE1-NEXT:    ret i1 [[TMP12]]
+;
+; X86-SSE2-LABEL: define i1 @length5_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X86-SSE2-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X86-SSE2-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X86-SSE2-NEXT:    ret i1 [[TMP12]]
+;
+; X86-SSE41-LABEL: define i1 @length5_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X86-SSE41-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X86-SSE41-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X86-SSE41-NEXT:    ret i1 [[TMP12]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 5) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length5_lt(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length5_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    br label [[LOADBB:%.*]]
+; X86:       res_block:
+; X86-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X86-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86:       loadbb:
+; X86-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X86:       loadbb1:
+; X86-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X86-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X86-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X86-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X86-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X86-NEXT:    br label [[ENDBLOCK]]
+; X86:       endblock:
+; X86-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length5_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE1:       res_block:
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE1:       loadbb:
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE1-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE1-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X86-SSE1:       loadbb1:
+; X86-SSE1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE1-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X86-SSE1-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X86-SSE1-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X86-SSE1-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X86-SSE1-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X86-SSE1-NEXT:    br label [[ENDBLOCK]]
+; X86-SSE1:       endblock:
+; X86-SSE1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length5_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE2:       res_block:
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE2:       loadbb:
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE2-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE2-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X86-SSE2:       loadbb1:
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X86-SSE2-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X86-SSE2-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X86-SSE2-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X86-SSE2-NEXT:    br label [[ENDBLOCK]]
+; X86-SSE2:       endblock:
+; X86-SSE2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length5_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE41:       res_block:
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE41:       loadbb:
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE41-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X86-SSE41:       loadbb1:
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X86-SSE41-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X86-SSE41-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X86-SSE41-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
+; X86-SSE41-NEXT:    br label [[ENDBLOCK]]
+; X86-SSE41:       endblock:
+; X86-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 5) nounwind
+  %c = icmp slt i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length7(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length7(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    br label [[LOADBB:%.*]]
+; X86:       res_block:
+; X86-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X86-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X86-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86:       loadbb:
+; X86-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86:       loadbb1:
+; X86-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X86-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X86-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86:       endblock:
+; X86-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-NEXT:    ret i32 [[PHI_RES]]
+;
+; X86-SSE1-LABEL: define i32 @length7(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE1:       res_block:
+; X86-SSE1-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X86-SSE1-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE1:       loadbb:
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE1-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86-SSE1:       loadbb1:
+; X86-SSE1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X86-SSE1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X86-SSE1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-SSE1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE1-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-SSE1-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-SSE1-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-SSE1-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86-SSE1:       endblock:
+; X86-SSE1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE1-NEXT:    ret i32 [[PHI_RES]]
+;
+; X86-SSE2-LABEL: define i32 @length7(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE2:       res_block:
+; X86-SSE2-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X86-SSE2-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE2:       loadbb:
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE2-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86-SSE2:       loadbb1:
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE2-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-SSE2-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-SSE2-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-SSE2-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86-SSE2:       endblock:
+; X86-SSE2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X86-SSE41-LABEL: define i32 @length7(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE41:       res_block:
+; X86-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X86-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE41:       loadbb:
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE41-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86-SSE41:       loadbb1:
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE41-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-SSE41-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-SSE41-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-SSE41-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86-SSE41:       endblock:
+; X86-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE41-NEXT:    ret i32 [[PHI_RES]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 7) nounwind
+  ret i32 %m
+}
+
+define i1 @length7_lt(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length7_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    br label [[LOADBB:%.*]]
+; X86:       res_block:
+; X86-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X86-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X86-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86:       loadbb:
+; X86-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86:       loadbb1:
+; X86-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X86-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X86-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86:       endblock:
+; X86-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length7_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE1:       res_block:
+; X86-SSE1-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X86-SSE1-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE1:       loadbb:
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE1-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86-SSE1:       loadbb1:
+; X86-SSE1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X86-SSE1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X86-SSE1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-SSE1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE1-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-SSE1-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-SSE1-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-SSE1-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86-SSE1:       endblock:
+; X86-SSE1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length7_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE2:       res_block:
+; X86-SSE2-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X86-SSE2-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE2:       loadbb:
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE2-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86-SSE2:       loadbb1:
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE2-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-SSE2-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-SSE2-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-SSE2-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86-SSE2:       endblock:
+; X86-SSE2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length7_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE41:       res_block:
+; X86-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X86-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE41:       loadbb:
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE41-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86-SSE41:       loadbb1:
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE41-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-SSE41-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-SSE41-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-SSE41-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86-SSE41:       endblock:
+; X86-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 7) nounwind
+  %c = icmp slt i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length7_eq(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length7_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X86-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X86-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X86-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-NEXT:    ret i1 [[TMP10]]
+;
+; X86-SSE1-LABEL: define i1 @length7_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X86-SSE1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X86-SSE1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-SSE1-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-SSE1-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-SSE1-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X86-SSE1-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-SSE1-NEXT:    ret i1 [[TMP10]]
+;
+; X86-SSE2-LABEL: define i1 @length7_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-SSE2-NEXT:    ret i1 [[TMP10]]
+;
+; X86-SSE41-LABEL: define i1 @length7_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-SSE41-NEXT:    ret i1 [[TMP10]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 7) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length8(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length8(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    br label [[LOADBB:%.*]]
+; X86:       res_block:
+; X86-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X86-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X86-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86:       loadbb:
+; X86-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86:       loadbb1:
+; X86-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86:       endblock:
+; X86-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-NEXT:    ret i32 [[PHI_RES]]
+;
+; X86-SSE1-LABEL: define i32 @length8(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE1:       res_block:
+; X86-SSE1-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X86-SSE1-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE1-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE1:       loadbb:
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE1-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE1-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86-SSE1:       loadbb1:
+; X86-SSE1-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE1-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-SSE1-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE1-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-SSE1-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-SSE1-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-SSE1-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86-SSE1:       endblock:
+; X86-SSE1-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE1-NEXT:    ret i32 [[PHI_RES]]
+;
+; X86-SSE2-LABEL: define i32 @length8(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE2:       res_block:
+; X86-SSE2-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X86-SSE2-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE2-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE2:       loadbb:
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE2-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE2-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86-SSE2:       loadbb1:
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE2-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-SSE2-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-SSE2-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-SSE2-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86-SSE2:       endblock:
+; X86-SSE2-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE2-NEXT:    ret i32 [[PHI_RES]]
+;
+; X86-SSE41-LABEL: define i32 @length8(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    br label [[LOADBB:%.*]]
+; X86-SSE41:       res_block:
+; X86-SSE41-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X86-SSE41-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
+; X86-SSE41-NEXT:    br label [[ENDBLOCK:%.*]]
+; X86-SSE41:       loadbb:
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X86-SSE41-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X86-SSE41-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X86-SSE41:       loadbb1:
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X86-SSE41-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X86-SSE41-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X86-SSE41-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X86-SSE41-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X86-SSE41:       endblock:
+; X86-SSE41-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X86-SSE41-NEXT:    ret i32 [[PHI_RES]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 8) nounwind
+  ret i32 %m
+}
+
+define i1 @length8_eq(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length8_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X86-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length8_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE1-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-SSE1-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-SSE1-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-SSE1-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X86-SSE1-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length8_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length8_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 8) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i1 @length8_eq_const(ptr %X) nounwind {
+; X86-LABEL: define i1 @length8_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = xor i32 [[TMP1]], 858927408
+; X86-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 1
+; X86-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP4]], 926299444
+; X86-NEXT:    [[TMP6:%.*]] = or i32 [[TMP2]], [[TMP5]]
+; X86-NEXT:    [[TMP7:%.*]] = icmp ne i32 [[TMP6]], 0
+; X86-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X86-NEXT:    ret i1 [[TMP7]]
+;
+; X86-SSE1-LABEL: define i1 @length8_eq_const(
+; X86-SSE1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE1-NEXT:    [[TMP2:%.*]] = xor i32 [[TMP1]], 858927408
+; X86-SSE1-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 1
+; X86-SSE1-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP4]], 926299444
+; X86-SSE1-NEXT:    [[TMP6:%.*]] = or i32 [[TMP2]], [[TMP5]]
+; X86-SSE1-NEXT:    [[TMP7:%.*]] = icmp ne i32 [[TMP6]], 0
+; X86-SSE1-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X86-SSE1-NEXT:    ret i1 [[TMP7]]
+;
+; X86-SSE2-LABEL: define i1 @length8_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = xor i32 [[TMP1]], 858927408
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 1
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP4]], 926299444
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = or i32 [[TMP2]], [[TMP5]]
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = icmp ne i32 [[TMP6]], 0
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X86-SSE2-NEXT:    ret i1 [[TMP7]]
+;
+; X86-SSE41-LABEL: define i1 @length8_eq_const(
+; X86-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = xor i32 [[TMP1]], 858927408
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 1
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP4]], 926299444
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = or i32 [[TMP2]], [[TMP5]]
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = icmp ne i32 [[TMP6]], 0
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X86-SSE41-NEXT:    ret i1 [[TMP7]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 8) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+; memcmp of 9 bytes whose result is tested for equality with zero.
+; Every check prefix below expects the memcmp call to be left in place.
+define i1 @length9_eq(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length9_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 9) #[[ATTR5:[0-9]+]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length9_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 9) #[[ATTR5:[0-9]+]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length9_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 9) #[[ATTR5:[0-9]+]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length9_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 9) #[[ATTR5:[0-9]+]]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 9) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; memcmp of 10 bytes whose result is tested for equality with zero.
+; Every check prefix below expects the memcmp call to be left in place.
+define i1 @length10_eq(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length10_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 10) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length10_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 10) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length10_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 10) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length10_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 10) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 10) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; memcmp of 11 bytes whose result is tested for equality with zero.
+; Every check prefix below expects the memcmp call to be left in place.
+define i1 @length11_eq(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length11_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 11) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length11_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 11) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length11_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 11) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length11_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 11) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 11) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; memcmp of 12 bytes. Note that, despite the _eq name, the result is tested
+; with icmp ne against zero. Every check prefix below expects the memcmp call
+; to be left in place.
+define i1 @length12_eq(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length12_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 12) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length12_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 12) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length12_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 12) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length12_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 12) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 12) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+; memcmp of 12 bytes whose full i32 result is returned to the caller.
+; Every check prefix below expects the memcmp call to be left in place.
+define i32 @length12(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length12(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 12) #[[ATTR5]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE1-LABEL: define i32 @length12(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 12) #[[ATTR5]]
+; X86-SSE1-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length12(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 12) #[[ATTR5]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+; X86-SSE41-LABEL: define i32 @length12(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 12) #[[ATTR5]]
+; X86-SSE41-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 12) nounwind
+  ret i32 %m
+}
+
+; memcmp of 13 bytes whose result is tested for equality with zero.
+; Every check prefix below expects the memcmp call to be left in place.
+define i1 @length13_eq(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length13_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 13) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length13_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 13) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length13_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 13) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length13_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 13) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 13) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; memcmp of 14 bytes whose result is tested for equality with zero.
+; Every check prefix below expects the memcmp call to be left in place.
+define i1 @length14_eq(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length14_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 14) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length14_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 14) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length14_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 14) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length14_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 14) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 14) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; memcmp of 15 bytes whose full i32 result is returned to the caller.
+; Every check prefix below expects the memcmp call to be left in place.
+define i32 @length15(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length15(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 15) #[[ATTR5]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE1-LABEL: define i32 @length15(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 15) #[[ATTR5]]
+; X86-SSE1-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length15(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 15) #[[ATTR5]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+; X86-SSE41-LABEL: define i32 @length15(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 15) #[[ATTR5]]
+; X86-SSE41-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 15) nounwind
+  ret i32 %m
+}
+
+; memcmp of 15 bytes whose result is tested with a signed less-than-zero
+; comparison. Every check prefix below expects the memcmp call to be left in
+; place.
+define i1 @length15_lt(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length15_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 15) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp slt i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length15_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 15) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp slt i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length15_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 15) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp slt i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length15_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 15) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp slt i32 [[M]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 15) nounwind
+  %c = icmp slt i32 %m, 0
+  ret i1 %c
+}
+
+; memcmp of 15 bytes between %X and the global @.str starting at element 1;
+; the full i32 result is returned. The %Y parameter is not used by the body.
+; Every check prefix below expects the memcmp call to be left in place.
+define i32 @length15_const(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length15_const(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i32 15) #[[ATTR5]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE1-LABEL: define i32 @length15_const(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i32 15) #[[ATTR5]]
+; X86-SSE1-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length15_const(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i32 15) #[[ATTR5]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+; X86-SSE41-LABEL: define i32 @length15_const(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i32 15) #[[ATTR5]]
+; X86-SSE41-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i32 15) nounwind
+  ret i32 %m
+}
+
+; memcmp of 15 bytes whose result is tested for equality with zero.
+; Every check prefix below expects the memcmp call to be left in place.
+define i1 @length15_eq(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length15_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 15) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length15_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 15) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length15_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 15) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length15_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 15) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 15) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; memcmp of 15 bytes between %X and the global @.str starting at element 1;
+; the result is tested with a signed greater-than-zero comparison. The %Y
+; parameter is not used by the body. Every check prefix below expects the
+; memcmp call to be left in place.
+define i1 @length15_gt_const(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @length15_gt_const(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i32 15) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp sgt i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length15_gt_const(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i32 15) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp sgt i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length15_gt_const(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i32 15) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp sgt i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length15_gt_const(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i32 15) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp sgt i32 [[M]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([513 x i8], ptr @.str, i32 0, i32 1), i32 15) nounwind
+  %c = icmp sgt i32 %m, 0
+  ret i1 %c
+}
+
+; PR33329 - https://bugs.llvm.org/show_bug.cgi?id=33329
+
+; memcmp of 16 bytes whose full i32 result is returned to the caller.
+; Every check prefix below expects the memcmp call to be left in place.
+define i32 @length16(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length16(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 16) #[[ATTR5]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE1-LABEL: define i32 @length16(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 16) #[[ATTR5]]
+; X86-SSE1-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length16(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 16) #[[ATTR5]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+; X86-SSE41-LABEL: define i32 @length16(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 16) #[[ATTR5]]
+; X86-SSE41-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 16) nounwind
+  ret i32 %m
+}
+
+; memcmp of 16 bytes. Note that, despite the _eq name, the result is tested
+; with icmp ne against zero. The X86 and X86-SSE1 prefixes expect the memcmp
+; call to be left in place; the X86-SSE2 and X86-SSE41 prefixes expect it to be
+; replaced by one i128 load from each pointer and a direct icmp ne.
+define i1 @length16_eq(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length16_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 16) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length16_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 16) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length16_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = icmp ne i128 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X86-SSE2-NEXT:    ret i1 [[TMP3]]
+;
+; X86-SSE41-LABEL: define i1 @length16_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = icmp ne i128 [[TMP1]], [[TMP2]]
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X86-SSE41-NEXT:    ret i1 [[TMP3]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 16) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+; memcmp of 16 bytes whose result is tested with a signed less-than-zero
+; comparison. Every check prefix below expects the memcmp call to be left in
+; place.
+define i1 @length16_lt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length16_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 16) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length16_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 16) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length16_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 16) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length16_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 16) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 16) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+; memcmp of 16 bytes whose result is tested with a signed greater-than-zero
+; comparison. Every check prefix below expects the memcmp call to be left in
+; place.
+define i1 @length16_gt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length16_gt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 16) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length16_gt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 16) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length16_gt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 16) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length16_gt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 16) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 16) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+; memcmp of 16 bytes between %X and the global @.str, tested for equality
+; with zero. The X86 and X86-SSE1 prefixes expect the memcmp call to be left
+; in place; the X86-SSE2 and X86-SSE41 prefixes expect a single i128 load from
+; %X compared against an i128 constant, followed by zext and icmp eq with 0.
+define i1 @length16_eq_const(ptr %X) nounwind {
+; X86-LABEL: define i1 @length16_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 16) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length16_eq_const(
+; X86-SSE1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 16) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length16_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = icmp ne i128 [[TMP1]], 70720121592765328381466889075544961328
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length16_eq_const(
+; X86-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = icmp ne i128 [[TMP1]], 70720121592765328381466889075544961328
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 16) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; PR33914 - https://bugs.llvm.org/show_bug.cgi?id=33914
+
+; memcmp of 24 bytes whose full i32 result is returned to the caller.
+; Every check prefix below expects the memcmp call to be left in place.
+define i32 @length24(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length24(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 24) #[[ATTR5]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE1-LABEL: define i32 @length24(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 24) #[[ATTR5]]
+; X86-SSE1-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length24(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 24) #[[ATTR5]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+; X86-SSE41-LABEL: define i32 @length24(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 24) #[[ATTR5]]
+; X86-SSE41-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 24) nounwind
+  ret i32 %m
+}
+
+; memcmp of 24 bytes whose result is tested for equality with zero. The X86
+; and X86-SSE1 prefixes expect the memcmp call to be left in place. The
+; X86-SSE2 and X86-SSE41 prefixes expect two i128 loads per pointer, at byte
+; offsets 0 and 8 (so the two 16-byte loads overlap), whose xor results are
+; or'd together and compared against zero.
+define i1 @length24_eq(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length24_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 24) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length24_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 24) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length24_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length24_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 24) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+; memcmp of 24 bytes whose result is tested with a signed less-than-zero
+; comparison. Every check prefix below expects the memcmp call to be left in
+; place.
+define i1 @length24_lt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length24_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 24) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length24_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 24) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length24_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 24) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length24_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 24) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 24) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+; memcmp of 24 bytes whose result is tested with a signed greater-than-zero
+; comparison. Every check prefix below expects the memcmp call to be left in
+; place.
+define i1 @length24_gt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length24_gt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 24) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length24_gt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 24) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length24_gt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 24) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length24_gt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 24) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 24) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+; memcmp of 24 bytes between %X and the global @.str. Note that, despite the
+; _eq name, the result is tested with icmp ne against zero. The X86 and
+; X86-SSE1 prefixes expect the memcmp call to be left in place. The X86-SSE2
+; and X86-SSE41 prefixes expect two i128 loads from %X at byte offsets 0 and 8,
+; each xor'd with an i128 constant, or'd together and compared against zero;
+; the i1 is returned directly and the trailing zext is left unused.
+define i1 @length24_eq_const(ptr %X) nounwind {
+; X86-LABEL: define i1 @length24_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 24) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length24_eq_const(
+; X86-SSE1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 24) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length24_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 68051240286688436651889234231545575736
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X86-SSE2-NEXT:    ret i1 [[TMP7]]
+;
+; X86-SSE41-LABEL: define i1 @length24_eq_const(
+; X86-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 68051240286688436651889234231545575736
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X86-SSE41-NEXT:    ret i1 [[TMP7]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 24) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+; memcmp of 31 bytes whose full i32 result is returned to the caller.
+; Every check prefix below expects the memcmp call to be left in place.
+define i32 @length31(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length31(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 31) #[[ATTR5]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE1-LABEL: define i32 @length31(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 31) #[[ATTR5]]
+; X86-SSE1-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length31(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 31) #[[ATTR5]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+; X86-SSE41-LABEL: define i32 @length31(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 31) #[[ATTR5]]
+; X86-SSE41-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 31) nounwind
+  ret i32 %m
+}
+
+define i1 @length31_eq(ptr %x, ptr %y) nounwind {
+; X86-NOSSE-LABEL: length31_eq:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl $31
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    calll memcmp
+; X86-NOSSE-NEXT:    addl $12, %esp
+; X86-NOSSE-NEXT:    testl %eax, %eax
+; X86-NOSSE-NEXT:    sete %al
+; X86-NOSSE-NEXT:    retl
+;
+; X86-LABEL: define i1 @length31_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 31) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length31_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 31) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length31_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length31_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 31) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length31_lt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length31_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 31) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length31_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 31) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length31_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 31) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length31_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 31) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 31) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length31_gt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length31_gt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 31) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length31_gt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 31) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length31_gt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 31) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length31_gt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 31) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 31) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length31_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"="128" {
+; X86-NOSSE-LABEL: length31_eq_prefer128:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl $31
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    calll memcmp
+; X86-NOSSE-NEXT:    addl $12, %esp
+; X86-NOSSE-NEXT:    testl %eax, %eax
+; X86-NOSSE-NEXT:    sete %al
+; X86-NOSSE-NEXT:    retl
+;
+; X86-LABEL: define i1 @length31_eq_prefer128(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2:[0-9]+]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 31) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length31_eq_prefer128(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2:[0-9]+]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 31) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length31_eq_prefer128(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2:[0-9]+]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length31_eq_prefer128(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2:[0-9]+]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 15
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 31) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length31_eq_const(ptr %X) nounwind {
+; X86-NOSSE-LABEL: length31_eq_const:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl $31
+; X86-NOSSE-NEXT:    pushl $.L.str
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    calll memcmp
+; X86-NOSSE-NEXT:    addl $12, %esp
+; X86-NOSSE-NEXT:    testl %eax, %eax
+; X86-NOSSE-NEXT:    setne %al
+; X86-NOSSE-NEXT:    retl
+;
+; X86-LABEL: define i1 @length31_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 31) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length31_eq_const(
+; X86-SSE1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 31) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length31_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 64100044907875699958541276911416849973
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X86-SSE2-NEXT:    ret i1 [[TMP7]]
+;
+; X86-SSE41-LABEL: define i1 @length31_eq_const(
+; X86-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 15
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 64100044907875699958541276911416849973
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X86-SSE41-NEXT:    ret i1 [[TMP7]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 31) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length32(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length32(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 32) #[[ATTR5]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE1-LABEL: define i32 @length32(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 32) #[[ATTR5]]
+; X86-SSE1-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length32(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 32) #[[ATTR5]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+; X86-SSE41-LABEL: define i32 @length32(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 32) #[[ATTR5]]
+; X86-SSE41-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 32) nounwind
+  ret i32 %m
+}
+
+; PR33325 - https://bugs.llvm.org/show_bug.cgi?id=33325
+
+define i1 @length32_eq(ptr %x, ptr %y) nounwind {
+; X86-NOSSE-LABEL: length32_eq:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl $32
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    calll memcmp
+; X86-NOSSE-NEXT:    addl $12, %esp
+; X86-NOSSE-NEXT:    testl %eax, %eax
+; X86-NOSSE-NEXT:    sete %al
+; X86-NOSSE-NEXT:    retl
+;
+; X86-LABEL: define i1 @length32_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 32) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length32_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 32) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length32_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length32_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 32) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length32_lt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length32_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 32) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length32_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 32) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length32_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 32) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length32_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 32) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 32) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length32_gt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length32_gt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 32) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length32_gt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 32) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length32_gt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 32) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length32_gt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 32) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 32) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length32_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"="128" {
+; X86-NOSSE-LABEL: length32_eq_prefer128:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl $32
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    calll memcmp
+; X86-NOSSE-NEXT:    addl $12, %esp
+; X86-NOSSE-NEXT:    testl %eax, %eax
+; X86-NOSSE-NEXT:    sete %al
+; X86-NOSSE-NEXT:    retl
+;
+; X86-LABEL: define i1 @length32_eq_prefer128(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 32) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length32_eq_prefer128(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 32) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length32_eq_prefer128(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X86-SSE2-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X86-SSE2-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X86-SSE2-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length32_eq_prefer128(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; X86-SSE41-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; X86-SSE41-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; X86-SSE41-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 32) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length32_eq_const(ptr %X) nounwind {
+; X86-NOSSE-LABEL: length32_eq_const:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    pushl $32
+; X86-NOSSE-NEXT:    pushl $.L.str
+; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    calll memcmp
+; X86-NOSSE-NEXT:    addl $12, %esp
+; X86-NOSSE-NEXT:    testl %eax, %eax
+; X86-NOSSE-NEXT:    setne %al
+; X86-NOSSE-NEXT:    retl
+;
+; X86-LABEL: define i1 @length32_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 32) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length32_eq_const(
+; X86-SSE1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 32) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length32_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE2-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X86-SSE2-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X86-SSE2-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X86-SSE2-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 65382562593882267225249597816672106294
+; X86-SSE2-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X86-SSE2-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; X86-SSE2-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X86-SSE2-NEXT:    ret i1 [[TMP7]]
+;
+; X86-SSE41-LABEL: define i1 @length32_eq_const(
+; X86-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X86-SSE41-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; X86-SSE41-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; X86-SSE41-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; X86-SSE41-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 65382562593882267225249597816672106294
+; X86-SSE41-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; X86-SSE41-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; X86-SSE41-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; X86-SSE41-NEXT:    ret i1 [[TMP7]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 32) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length48(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length48(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 48) #[[ATTR5]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE1-LABEL: define i32 @length48(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 48) #[[ATTR5]]
+; X86-SSE1-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length48(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 48) #[[ATTR5]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+; X86-SSE41-LABEL: define i32 @length48(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 48) #[[ATTR5]]
+; X86-SSE41-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 48) nounwind
+  ret i32 %m
+}
+
+define i1 @length48_eq(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length48_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 48) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length48_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 48) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length48_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 48) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length48_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 48) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 48) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length48_lt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length48_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 48) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length48_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 48) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length48_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 48) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length48_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 48) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 48) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length48_gt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length48_gt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 48) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length48_gt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 48) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length48_gt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 48) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length48_gt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 48) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 48) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length48_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"="128" {
+; X86-LABEL: define i1 @length48_eq_prefer128(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 48) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length48_eq_prefer128(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 48) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length48_eq_prefer128(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 48) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length48_eq_prefer128(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR2]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 48) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 48) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length48_eq_const(ptr %X) nounwind {
+; X86-LABEL: define i1 @length48_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 48) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length48_eq_const(
+; X86-SSE1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 48) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length48_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 48) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length48_eq_const(
+; X86-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 48) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp ne i32 [[M]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 48) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length63(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length63(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 63) #[[ATTR5]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE1-LABEL: define i32 @length63(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 63) #[[ATTR5]]
+; X86-SSE1-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length63(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 63) #[[ATTR5]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+; X86-SSE41-LABEL: define i32 @length63(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 63) #[[ATTR5]]
+; X86-SSE41-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 63) nounwind
+  ret i32 %m
+}
+
+define i1 @length63_eq(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length63_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 63) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length63_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 63) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length63_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 63) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length63_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 63) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 63) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length63_lt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length63_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 63) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length63_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 63) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length63_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 63) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length63_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 63) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 63) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length63_gt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length63_gt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 63) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length63_gt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 63) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length63_gt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 63) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length63_gt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 63) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 63) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length63_eq_const(ptr %X) nounwind {
+; X86-LABEL: define i1 @length63_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 63) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length63_eq_const(
+; X86-SSE1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 63) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length63_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 63) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length63_eq_const(
+; X86-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 63) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 63) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length64(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length64(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 64) #[[ATTR5]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE1-LABEL: define i32 @length64(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 64) #[[ATTR5]]
+; X86-SSE1-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length64(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 64) #[[ATTR5]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+; X86-SSE41-LABEL: define i32 @length64(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 64) #[[ATTR5]]
+; X86-SSE41-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 64) nounwind
+  ret i32 %m
+}
+
+define i1 @length64_eq(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length64_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 64) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length64_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 64) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length64_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 64) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length64_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 64) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 64) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length64_lt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length64_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 64) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length64_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 64) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length64_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 64) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length64_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 64) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 64) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length64_gt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length64_gt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 64) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length64_gt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 64) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length64_gt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 64) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length64_gt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 64) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 64) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length64_eq_const(ptr %X) nounwind {
+; X86-LABEL: define i1 @length64_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 64) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length64_eq_const(
+; X86-SSE1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 64) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length64_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 64) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length64_eq_const(
+; X86-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 64) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 64) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length96(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length96(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 96) #[[ATTR5]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE1-LABEL: define i32 @length96(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 96) #[[ATTR5]]
+; X86-SSE1-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length96(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 96) #[[ATTR5]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+; X86-SSE41-LABEL: define i32 @length96(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 96) #[[ATTR5]]
+; X86-SSE41-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 96) nounwind
+  ret i32 %m
+}
+
+define i1 @length96_eq(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length96_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 96) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length96_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 96) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length96_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 96) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length96_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 96) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 96) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length96_lt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length96_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 96) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length96_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 96) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length96_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 96) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length96_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 96) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 96) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length96_gt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length96_gt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 96) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length96_gt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 96) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length96_gt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 96) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length96_gt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 96) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 96) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length96_eq_const(ptr %X) nounwind {
+; X86-LABEL: define i1 @length96_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 96) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length96_eq_const(
+; X86-SSE1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 96) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length96_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 96) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length96_eq_const(
+; X86-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 96) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 96) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length127(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length127(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 127) #[[ATTR5]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE1-LABEL: define i32 @length127(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 127) #[[ATTR5]]
+; X86-SSE1-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length127(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 127) #[[ATTR5]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+; X86-SSE41-LABEL: define i32 @length127(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 127) #[[ATTR5]]
+; X86-SSE41-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 127) nounwind
+  ret i32 %m
+}
+
+define i1 @length127_eq(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length127_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 127) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length127_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 127) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length127_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 127) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length127_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 127) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 127) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length127_lt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length127_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 127) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length127_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 127) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length127_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 127) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length127_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 127) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 127) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length127_gt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length127_gt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 127) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length127_gt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 127) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length127_gt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 127) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length127_gt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 127) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 127) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length127_eq_const(ptr %X) nounwind {
+; X86-LABEL: define i1 @length127_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 127) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length127_eq_const(
+; X86-SSE1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 127) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length127_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 127) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length127_eq_const(
+; X86-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 127) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 127) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length128(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length128(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 128) #[[ATTR5]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE1-LABEL: define i32 @length128(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 128) #[[ATTR5]]
+; X86-SSE1-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length128(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 128) #[[ATTR5]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+; X86-SSE41-LABEL: define i32 @length128(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 128) #[[ATTR5]]
+; X86-SSE41-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 128) nounwind
+  ret i32 %m
+}
+
+define i1 @length128_eq(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length128_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 128) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length128_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 128) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length128_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 128) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length128_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 128) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 128) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length128_lt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length128_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 128) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length128_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 128) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length128_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 128) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length128_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 128) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 128) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length128_gt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length128_gt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 128) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length128_gt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 128) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length128_gt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 128) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length128_gt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 128) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 128) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length128_eq_const(ptr %X) nounwind {
+; X86-LABEL: define i1 @length128_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 128) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length128_eq_const(
+; X86-SSE1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 128) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length128_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 128) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length128_eq_const(
+; X86-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 128) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 128) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length192(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length192(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 192) #[[ATTR5]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE1-LABEL: define i32 @length192(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 192) #[[ATTR5]]
+; X86-SSE1-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length192(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 192) #[[ATTR5]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+; X86-SSE41-LABEL: define i32 @length192(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 192) #[[ATTR5]]
+; X86-SSE41-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 192) nounwind
+  ret i32 %m
+}
+
+define i1 @length192_eq(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length192_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 192) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length192_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 192) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length192_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 192) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length192_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 192) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 192) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length192_lt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length192_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 192) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length192_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 192) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length192_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 192) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length192_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 192) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 192) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length192_gt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length192_gt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 192) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length192_gt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 192) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length192_gt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 192) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length192_gt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 192) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 192) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length192_eq_const(ptr %X) nounwind {
+; X86-LABEL: define i1 @length192_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 192) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length192_eq_const(
+; X86-SSE1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 192) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length192_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 192) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length192_eq_const(
+; X86-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 192) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 192) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length255(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length255(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 255) #[[ATTR5]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE1-LABEL: define i32 @length255(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 255) #[[ATTR5]]
+; X86-SSE1-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length255(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 255) #[[ATTR5]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+; X86-SSE41-LABEL: define i32 @length255(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 255) #[[ATTR5]]
+; X86-SSE41-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 255) nounwind
+  ret i32 %m
+}
+
+define i1 @length255_eq(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length255_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 255) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length255_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 255) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length255_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 255) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length255_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 255) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 255) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length255_lt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length255_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 255) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length255_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 255) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length255_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 255) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length255_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 255) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 255) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length255_gt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length255_gt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 255) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length255_gt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 255) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length255_gt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 255) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length255_gt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 255) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 255) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length255_eq_const(ptr %X) nounwind {
+; X86-LABEL: define i1 @length255_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 255) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length255_eq_const(
+; X86-SSE1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 255) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length255_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 255) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length255_eq_const(
+; X86-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 255) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 255) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length256(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length256(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 256) #[[ATTR5]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE1-LABEL: define i32 @length256(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 256) #[[ATTR5]]
+; X86-SSE1-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length256(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 256) #[[ATTR5]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+; X86-SSE41-LABEL: define i32 @length256(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 256) #[[ATTR5]]
+; X86-SSE41-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 256) nounwind
+  ret i32 %m
+}
+
+define i1 @length256_eq(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length256_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 256) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length256_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 256) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length256_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 256) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length256_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 256) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 256) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length256_lt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length256_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 256) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length256_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 256) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length256_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 256) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length256_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 256) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 256) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length256_gt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length256_gt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 256) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length256_gt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 256) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length256_gt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 256) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length256_gt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 256) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 256) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length256_eq_const(ptr %X) nounwind {
+; X86-LABEL: define i1 @length256_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 256) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length256_eq_const(
+; X86-SSE1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 256) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length256_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 256) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length256_eq_const(
+; X86-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 256) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 256) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length384(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length384(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 384) #[[ATTR5]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE1-LABEL: define i32 @length384(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 384) #[[ATTR5]]
+; X86-SSE1-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length384(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 384) #[[ATTR5]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+; X86-SSE41-LABEL: define i32 @length384(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 384) #[[ATTR5]]
+; X86-SSE41-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 384) nounwind
+  ret i32 %m
+}
+
+define i1 @length384_eq(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length384_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 384) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length384_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 384) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length384_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 384) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length384_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 384) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 384) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length384_lt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length384_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 384) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length384_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 384) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length384_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 384) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length384_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 384) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 384) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length384_gt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length384_gt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 384) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length384_gt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 384) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length384_gt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 384) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length384_gt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 384) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 384) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length384_eq_const(ptr %X) nounwind {
+; X86-LABEL: define i1 @length384_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 384) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length384_eq_const(
+; X86-SSE1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 384) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length384_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 384) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length384_eq_const(
+; X86-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 384) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 384) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length511(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length511(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 511) #[[ATTR5]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE1-LABEL: define i32 @length511(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 511) #[[ATTR5]]
+; X86-SSE1-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length511(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 511) #[[ATTR5]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+; X86-SSE41-LABEL: define i32 @length511(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 511) #[[ATTR5]]
+; X86-SSE41-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 511) nounwind
+  ret i32 %m
+}
+
+define i1 @length511_eq(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length511_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 511) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length511_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 511) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length511_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 511) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length511_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 511) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 511) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length511_lt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length511_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 511) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length511_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 511) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length511_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 511) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length511_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 511) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 511) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length511_gt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length511_gt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 511) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length511_gt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 511) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length511_gt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 511) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length511_gt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 511) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 511) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length511_eq_const(ptr %X) nounwind {
+; X86-LABEL: define i1 @length511_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 511) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length511_eq_const(
+; X86-SSE1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 511) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length511_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 511) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length511_eq_const(
+; X86-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 511) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 511) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+define i32 @length512(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @length512(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 512) #[[ATTR5]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE1-LABEL: define i32 @length512(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 512) #[[ATTR5]]
+; X86-SSE1-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @length512(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 512) #[[ATTR5]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+; X86-SSE41-LABEL: define i32 @length512(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 512) #[[ATTR5]]
+; X86-SSE41-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 512) nounwind
+  ret i32 %m
+}
+
+define i1 @length512_eq(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length512_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 512) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length512_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 512) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length512_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 512) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length512_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 512) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 512) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length512_lt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length512_lt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 512) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length512_lt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 512) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length512_lt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 512) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length512_lt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 512) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 512) nounwind
+  %cmp = icmp slt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length512_gt(ptr %x, ptr %y) nounwind {
+; X86-LABEL: define i1 @length512_gt(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 512) #[[ATTR5]]
+; X86-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE1-LABEL: define i1 @length512_gt(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 512) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE1-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE2-LABEL: define i1 @length512_gt(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 512) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE2-NEXT:    ret i1 [[CMP]]
+;
+; X86-SSE41-LABEL: define i1 @length512_gt(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 512) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CALL]], 0
+; X86-SSE41-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 512) nounwind
+  %cmp = icmp sgt i32 %call, 0
+  ret i1 %cmp
+}
+
+define i1 @length512_eq_const(ptr %X) nounwind {
+; X86-LABEL: define i1 @length512_eq_const(
+; X86-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 512) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @length512_eq_const(
+; X86-SSE1-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 512) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @length512_eq_const(
+; X86-SSE2-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 512) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @length512_eq_const(
+; X86-SSE41-SAME: ptr [[X:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr @.str, i32 512) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 512) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; This checks that we do not do stupid things with huge sizes.
+define i32 @huge_length(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i32 @huge_length(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 -1) #[[ATTR5]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE1-LABEL: define i32 @huge_length(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 -1) #[[ATTR5]]
+; X86-SSE1-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @huge_length(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 -1) #[[ATTR5]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+; X86-SSE41-LABEL: define i32 @huge_length(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 -1) #[[ATTR5]]
+; X86-SSE41-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 9223372036854775807) nounwind
+  ret i32 %m
+}
+
+define i1 @huge_length_eq(ptr %X, ptr %Y) nounwind {
+; X86-LABEL: define i1 @huge_length_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 -1) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @huge_length_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 -1) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @huge_length_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 -1) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @huge_length_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 -1) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 9223372036854775807) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; This checks non-constant sizes.
+define i32 @nonconst_length(ptr %X, ptr %Y, i32 %size) nounwind {
+; X86-LABEL: define i32 @nonconst_length(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i32 [[SIZE:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 [[SIZE]]) #[[ATTR5]]
+; X86-NEXT:    ret i32 [[M]]
+;
+; X86-SSE1-LABEL: define i32 @nonconst_length(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i32 [[SIZE:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 [[SIZE]]) #[[ATTR5]]
+; X86-SSE1-NEXT:    ret i32 [[M]]
+;
+; X86-SSE2-LABEL: define i32 @nonconst_length(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i32 [[SIZE:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 [[SIZE]]) #[[ATTR5]]
+; X86-SSE2-NEXT:    ret i32 [[M]]
+;
+; X86-SSE41-LABEL: define i32 @nonconst_length(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i32 [[SIZE:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 [[SIZE]]) #[[ATTR5]]
+; X86-SSE41-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 %size) nounwind
+  ret i32 %m
+}
+
+define i1 @nonconst_length_eq(ptr %X, ptr %Y, i32 %size) nounwind {
+; X86-LABEL: define i1 @nonconst_length_eq(
+; X86-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i32 [[SIZE:%.*]]) #[[ATTR1]] {
+; X86-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 [[SIZE]]) #[[ATTR5]]
+; X86-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-NEXT:    ret i1 [[C]]
+;
+; X86-SSE1-LABEL: define i1 @nonconst_length_eq(
+; X86-SSE1-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i32 [[SIZE:%.*]]) #[[ATTR1]] {
+; X86-SSE1-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 [[SIZE]]) #[[ATTR5]]
+; X86-SSE1-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE1-NEXT:    ret i1 [[C]]
+;
+; X86-SSE2-LABEL: define i1 @nonconst_length_eq(
+; X86-SSE2-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i32 [[SIZE:%.*]]) #[[ATTR1]] {
+; X86-SSE2-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 [[SIZE]]) #[[ATTR5]]
+; X86-SSE2-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE2-NEXT:    ret i1 [[C]]
+;
+; X86-SSE41-LABEL: define i1 @nonconst_length_eq(
+; X86-SSE41-SAME: ptr [[X:%.*]], ptr [[Y:%.*]], i32 [[SIZE:%.*]]) #[[ATTR1]] {
+; X86-SSE41-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 [[SIZE]]) #[[ATTR5]]
+; X86-SSE41-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; X86-SSE41-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 %size) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
diff --git a/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-x32.ll b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-x32.ll
index d71ae8be19b668..5a0f4db363536d 100644
--- a/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-x32.ll
+++ b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp-x32.ll
@@ -1,64 +1,66 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -expand-memcmp -mtriple=i686-unknown-unknown   -data-layout=e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128 < %s | FileCheck %s --check-prefix=X32
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
 ; RUN: opt -S -passes=expand-memcmp -mtriple=i686-unknown-unknown   -data-layout=e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128 < %s | FileCheck %s --check-prefix=X32
 
 declare i32 @memcmp(ptr nocapture, ptr nocapture, i32)
 
 define i32 @cmp2(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X32-LABEL: @cmp2(
-; X32-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X:%.*]], align 1
-; X32-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y:%.*]], align 1
-; X32-NEXT:    [[TMP5:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
-; X32-NEXT:    [[TMP6:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
-; X32-NEXT:    [[TMP7:%.*]] = zext i16 [[TMP5]] to i32
-; X32-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP6]] to i32
-; X32-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
-; X32-NEXT:    ret i32 [[TMP9]]
+; X32-LABEL: define i32 @cmp2(
+; X32-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X32-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X32-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X32-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X32-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X32-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X32-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X32-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X32-NEXT:    ret i32 [[TMP7]]
 ;
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 2)
   ret i32 %call
 }
 
 define i32 @cmp2_align2(ptr nocapture readonly align 2 %x, ptr nocapture readonly align 2 %y)  {
-; X32-LABEL: @cmp2_align2(
-; X32-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X:%.*]], align 2
-; X32-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y:%.*]], align 2
-; X32-NEXT:    [[TMP5:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
-; X32-NEXT:    [[TMP6:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
-; X32-NEXT:    [[TMP7:%.*]] = zext i16 [[TMP5]] to i32
-; X32-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP6]] to i32
-; X32-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
-; X32-NEXT:    ret i32 [[TMP9]]
+; X32-LABEL: define i32 @cmp2_align2(
+; X32-SAME: ptr nocapture readonly align 2 [[X:%.*]], ptr nocapture readonly align 2 [[Y:%.*]]) {
+; X32-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 2
+; X32-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 2
+; X32-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X32-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X32-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X32-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X32-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X32-NEXT:    ret i32 [[TMP7]]
 ;
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 2)
   ret i32 %call
 }
 
 define i32 @cmp3(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X32-LABEL: @cmp3(
+; X32-LABEL: define i32 @cmp3(
+; X32-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
 ; X32-NEXT:    br label [[LOADBB:%.*]]
 ; X32:       res_block:
-; X32-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP7:%.*]], [[TMP8:%.*]]
+; X32-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP5:%.*]], [[TMP6:%.*]]
 ; X32-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
 ; X32-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X32:       loadbb:
-; X32-NEXT:    [[TMP5:%.*]] = load i16, ptr [[X:%.*]], align 1
-; X32-NEXT:    [[TMP6:%.*]] = load i16, ptr [[Y:%.*]], align 1
-; X32-NEXT:    [[TMP7]] = call i16 @llvm.bswap.i16(i16 [[TMP5]])
-; X32-NEXT:    [[TMP8]] = call i16 @llvm.bswap.i16(i16 [[TMP6]])
-; X32-NEXT:    [[TMP9:%.*]] = icmp eq i16 [[TMP7]], [[TMP8]]
-; X32-NEXT:    br i1 [[TMP9]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X32-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X]], align 1
+; X32-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y]], align 1
+; X32-NEXT:    [[TMP5]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X32-NEXT:    [[TMP6]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; X32-NEXT:    [[TMP7:%.*]] = icmp eq i16 [[TMP5]], [[TMP6]]
+; X32-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
 ; X32:       loadbb1:
-; X32-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[X]], i64 2
-; X32-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[Y]], i64 2
-; X32-NEXT:    [[TMP12:%.*]] = load i8, ptr [[TMP10]], align 1
-; X32-NEXT:    [[TMP13:%.*]] = load i8, ptr [[TMP11]], align 1
-; X32-NEXT:    [[TMP14:%.*]] = zext i8 [[TMP12]] to i32
-; X32-NEXT:    [[TMP15:%.*]] = zext i8 [[TMP13]] to i32
-; X32-NEXT:    [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]]
+; X32-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X32-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X32-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X32-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X32-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X32-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X32-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
 ; X32-NEXT:    br label [[ENDBLOCK]]
 ; X32:       endblock:
-; X32-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP16]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X32-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
 ; X32-NEXT:    ret i32 [[PHI_RES]]
 ;
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 3)
@@ -66,47 +68,49 @@ define i32 @cmp3(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp4(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X32-LABEL: @cmp4(
-; X32-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X:%.*]], align 1
-; X32-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y:%.*]], align 1
-; X32-NEXT:    [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
-; X32-NEXT:    [[TMP6:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
-; X32-NEXT:    [[TMP7:%.*]] = icmp ugt i32 [[TMP5]], [[TMP6]]
-; X32-NEXT:    [[TMP8:%.*]] = icmp ult i32 [[TMP5]], [[TMP6]]
-; X32-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP7]] to i32
-; X32-NEXT:    [[TMP10:%.*]] = zext i1 [[TMP8]] to i32
-; X32-NEXT:    [[TMP11:%.*]] = sub i32 [[TMP9]], [[TMP10]]
-; X32-NEXT:    ret i32 [[TMP11]]
+; X32-LABEL: define i32 @cmp4(
+; X32-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X32-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X32-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X32-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X32-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X32-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X32-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X32-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X32-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X32-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X32-NEXT:    ret i32 [[TMP9]]
 ;
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 4)
   ret i32 %call
 }
 
 define i32 @cmp5(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X32-LABEL: @cmp5(
+; X32-LABEL: define i32 @cmp5(
+; X32-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
 ; X32-NEXT:    br label [[LOADBB:%.*]]
 ; X32:       res_block:
-; X32-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP7:%.*]], [[TMP8:%.*]]
+; X32-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
 ; X32-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
 ; X32-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X32:       loadbb:
-; X32-NEXT:    [[TMP5:%.*]] = load i32, ptr [[X:%.*]], align 1
-; X32-NEXT:    [[TMP6:%.*]] = load i32, ptr [[Y:%.*]], align 1
-; X32-NEXT:    [[TMP7]] = call i32 @llvm.bswap.i32(i32 [[TMP5]])
-; X32-NEXT:    [[TMP8]] = call i32 @llvm.bswap.i32(i32 [[TMP6]])
-; X32-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP7]], [[TMP8]]
-; X32-NEXT:    br i1 [[TMP9]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X32-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X32-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X32-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X32-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X32-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X32-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
 ; X32:       loadbb1:
-; X32-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[X]], i64 4
-; X32-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[Y]], i64 4
-; X32-NEXT:    [[TMP12:%.*]] = load i8, ptr [[TMP10]], align 1
-; X32-NEXT:    [[TMP13:%.*]] = load i8, ptr [[TMP11]], align 1
-; X32-NEXT:    [[TMP14:%.*]] = zext i8 [[TMP12]] to i32
-; X32-NEXT:    [[TMP15:%.*]] = zext i8 [[TMP13]] to i32
-; X32-NEXT:    [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]]
+; X32-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X32-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X32-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X32-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X32-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X32-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X32-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
 ; X32-NEXT:    br label [[ENDBLOCK]]
 ; X32:       endblock:
-; X32-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP16]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X32-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
 ; X32-NEXT:    ret i32 [[PHI_RES]]
 ;
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 5)
@@ -114,32 +118,33 @@ define i32 @cmp5(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp6(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X32-LABEL: @cmp6(
+; X32-LABEL: define i32 @cmp6(
+; X32-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
 ; X32-NEXT:    br label [[LOADBB:%.*]]
 ; X32:       res_block:
-; X32-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1:%.*]] ]
-; X32-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP19:%.*]], [[LOADBB1]] ]
+; X32-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP14:%.*]], [[LOADBB1:%.*]] ]
+; X32-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP15:%.*]], [[LOADBB1]] ]
 ; X32-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
 ; X32-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
 ; X32-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X32:       loadbb:
-; X32-NEXT:    [[TMP5:%.*]] = load i32, ptr [[X:%.*]], align 1
-; X32-NEXT:    [[TMP6:%.*]] = load i32, ptr [[Y:%.*]], align 1
-; X32-NEXT:    [[TMP7]] = call i32 @llvm.bswap.i32(i32 [[TMP5]])
-; X32-NEXT:    [[TMP8]] = call i32 @llvm.bswap.i32(i32 [[TMP6]])
-; X32-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP7]], [[TMP8]]
-; X32-NEXT:    br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X32-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X32-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X32-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X32-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X32-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X32-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
 ; X32:       loadbb1:
-; X32-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[X]], i64 4
-; X32-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[Y]], i64 4
-; X32-NEXT:    [[TMP14:%.*]] = load i16, ptr [[TMP10]], align 1
-; X32-NEXT:    [[TMP15:%.*]] = load i16, ptr [[TMP11]], align 1
-; X32-NEXT:    [[TMP16:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP14]])
-; X32-NEXT:    [[TMP17:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP15]])
-; X32-NEXT:    [[TMP18]] = zext i16 [[TMP16]] to i32
-; X32-NEXT:    [[TMP19]] = zext i16 [[TMP17]] to i32
-; X32-NEXT:    [[TMP20:%.*]] = icmp eq i32 [[TMP18]], [[TMP19]]
-; X32-NEXT:    br i1 [[TMP20]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X32-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X32-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X32-NEXT:    [[TMP10:%.*]] = load i16, ptr [[TMP8]], align 1
+; X32-NEXT:    [[TMP11:%.*]] = load i16, ptr [[TMP9]], align 1
+; X32-NEXT:    [[TMP12:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP10]])
+; X32-NEXT:    [[TMP13:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP11]])
+; X32-NEXT:    [[TMP14]] = zext i16 [[TMP12]] to i32
+; X32-NEXT:    [[TMP15]] = zext i16 [[TMP13]] to i32
+; X32-NEXT:    [[TMP16:%.*]] = icmp eq i32 [[TMP14]], [[TMP15]]
+; X32-NEXT:    br i1 [[TMP16]], label [[ENDBLOCK]], label [[RES_BLOCK]]
 ; X32:       endblock:
 ; X32-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
 ; X32-NEXT:    ret i32 [[PHI_RES]]
@@ -149,30 +154,31 @@ define i32 @cmp6(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp7(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X32-LABEL: @cmp7(
+; X32-LABEL: define i32 @cmp7(
+; X32-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
 ; X32-NEXT:    br label [[LOADBB:%.*]]
 ; X32:       res_block:
-; X32-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1:%.*]] ]
-; X32-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP17:%.*]], [[LOADBB1]] ]
+; X32-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X32-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
 ; X32-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
 ; X32-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
 ; X32-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X32:       loadbb:
-; X32-NEXT:    [[TMP5:%.*]] = load i32, ptr [[X:%.*]], align 1
-; X32-NEXT:    [[TMP6:%.*]] = load i32, ptr [[Y:%.*]], align 1
-; X32-NEXT:    [[TMP7]] = call i32 @llvm.bswap.i32(i32 [[TMP5]])
-; X32-NEXT:    [[TMP8]] = call i32 @llvm.bswap.i32(i32 [[TMP6]])
-; X32-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP7]], [[TMP8]]
-; X32-NEXT:    br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X32-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X32-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X32-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X32-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X32-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X32-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
 ; X32:       loadbb1:
-; X32-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[X]], i64 3
-; X32-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[Y]], i64 3
-; X32-NEXT:    [[TMP14:%.*]] = load i32, ptr [[TMP10]], align 1
-; X32-NEXT:    [[TMP15:%.*]] = load i32, ptr [[TMP11]], align 1
-; X32-NEXT:    [[TMP16]] = call i32 @llvm.bswap.i32(i32 [[TMP14]])
-; X32-NEXT:    [[TMP17]] = call i32 @llvm.bswap.i32(i32 [[TMP15]])
-; X32-NEXT:    [[TMP18:%.*]] = icmp eq i32 [[TMP16]], [[TMP17]]
-; X32-NEXT:    br i1 [[TMP18]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X32-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X32-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X32-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X32-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X32-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X32-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X32-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X32-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
 ; X32:       endblock:
 ; X32-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
 ; X32-NEXT:    ret i32 [[PHI_RES]]
@@ -182,30 +188,31 @@ define i32 @cmp7(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp8(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X32-LABEL: @cmp8(
+; X32-LABEL: define i32 @cmp8(
+; X32-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
 ; X32-NEXT:    br label [[LOADBB:%.*]]
 ; X32:       res_block:
-; X32-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1:%.*]] ]
-; X32-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP17:%.*]], [[LOADBB1]] ]
+; X32-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X32-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
 ; X32-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
 ; X32-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
 ; X32-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X32:       loadbb:
-; X32-NEXT:    [[TMP5:%.*]] = load i32, ptr [[X:%.*]], align 1
-; X32-NEXT:    [[TMP6:%.*]] = load i32, ptr [[Y:%.*]], align 1
-; X32-NEXT:    [[TMP7]] = call i32 @llvm.bswap.i32(i32 [[TMP5]])
-; X32-NEXT:    [[TMP8]] = call i32 @llvm.bswap.i32(i32 [[TMP6]])
-; X32-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP7]], [[TMP8]]
-; X32-NEXT:    br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X32-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X32-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X32-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X32-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X32-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X32-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
 ; X32:       loadbb1:
-; X32-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[X]], i64 4
-; X32-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[Y]], i64 4
-; X32-NEXT:    [[TMP14:%.*]] = load i32, ptr [[TMP10]], align 1
-; X32-NEXT:    [[TMP15:%.*]] = load i32, ptr [[TMP11]], align 1
-; X32-NEXT:    [[TMP16]] = call i32 @llvm.bswap.i32(i32 [[TMP14]])
-; X32-NEXT:    [[TMP17]] = call i32 @llvm.bswap.i32(i32 [[TMP15]])
-; X32-NEXT:    [[TMP18:%.*]] = icmp eq i32 [[TMP16]], [[TMP17]]
-; X32-NEXT:    br i1 [[TMP18]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X32-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X32-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X32-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X32-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X32-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X32-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X32-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X32-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
 ; X32:       endblock:
 ; X32-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
 ; X32-NEXT:    ret i32 [[PHI_RES]]
@@ -215,8 +222,9 @@ define i32 @cmp8(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp9(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X32-LABEL: @cmp9(
-; X32-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X:%.*]], ptr [[Y:%.*]], i32 9)
+; X32-LABEL: define i32 @cmp9(
+; X32-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X32-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 9)
 ; X32-NEXT:    ret i32 [[CALL]]
 ;
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 9)
@@ -224,8 +232,9 @@ define i32 @cmp9(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp10(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X32-LABEL: @cmp10(
-; X32-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X:%.*]], ptr [[Y:%.*]], i32 10)
+; X32-LABEL: define i32 @cmp10(
+; X32-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X32-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 10)
 ; X32-NEXT:    ret i32 [[CALL]]
 ;
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 10)
@@ -233,8 +242,9 @@ define i32 @cmp10(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp11(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X32-LABEL: @cmp11(
-; X32-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X:%.*]], ptr [[Y:%.*]], i32 11)
+; X32-LABEL: define i32 @cmp11(
+; X32-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X32-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 11)
 ; X32-NEXT:    ret i32 [[CALL]]
 ;
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 11)
@@ -242,8 +252,9 @@ define i32 @cmp11(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp12(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X32-LABEL: @cmp12(
-; X32-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X:%.*]], ptr [[Y:%.*]], i32 12)
+; X32-LABEL: define i32 @cmp12(
+; X32-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X32-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 12)
 ; X32-NEXT:    ret i32 [[CALL]]
 ;
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 12)
@@ -251,8 +262,9 @@ define i32 @cmp12(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp13(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X32-LABEL: @cmp13(
-; X32-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X:%.*]], ptr [[Y:%.*]], i32 13)
+; X32-LABEL: define i32 @cmp13(
+; X32-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X32-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 13)
 ; X32-NEXT:    ret i32 [[CALL]]
 ;
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 13)
@@ -260,8 +272,9 @@ define i32 @cmp13(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp14(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X32-LABEL: @cmp14(
-; X32-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X:%.*]], ptr [[Y:%.*]], i32 14)
+; X32-LABEL: define i32 @cmp14(
+; X32-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X32-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 14)
 ; X32-NEXT:    ret i32 [[CALL]]
 ;
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 14)
@@ -269,8 +282,9 @@ define i32 @cmp14(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp15(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X32-LABEL: @cmp15(
-; X32-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X:%.*]], ptr [[Y:%.*]], i32 15)
+; X32-LABEL: define i32 @cmp15(
+; X32-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X32-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 15)
 ; X32-NEXT:    ret i32 [[CALL]]
 ;
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 15)
@@ -278,8 +292,9 @@ define i32 @cmp15(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp16(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X32-LABEL: @cmp16(
-; X32-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X:%.*]], ptr [[Y:%.*]], i32 16)
+; X32-LABEL: define i32 @cmp16(
+; X32-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X32-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 16)
 ; X32-NEXT:    ret i32 [[CALL]]
 ;
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 16)
@@ -287,12 +302,13 @@ define i32 @cmp16(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp_eq2(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X32-LABEL: @cmp_eq2(
-; X32-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X:%.*]], align 1
-; X32-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y:%.*]], align 1
-; X32-NEXT:    [[TMP5:%.*]] = icmp ne i16 [[TMP3]], [[TMP4]]
-; X32-NEXT:    [[TMP6:%.*]] = zext i1 [[TMP5]] to i32
-; X32-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP6]], 0
+; X32-LABEL: define i32 @cmp_eq2(
+; X32-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X32-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X32-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X32-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X32-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X32-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP4]], 0
 ; X32-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X32-NEXT:    ret i32 [[CONV]]
 ;
@@ -303,21 +319,22 @@ define i32 @cmp_eq2(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp_eq3(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X32-LABEL: @cmp_eq3(
-; X32-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X:%.*]], align 1
-; X32-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y:%.*]], align 1
-; X32-NEXT:    [[TMP5:%.*]] = xor i16 [[TMP3]], [[TMP4]]
-; X32-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 2
-; X32-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 2
-; X32-NEXT:    [[TMP8:%.*]] = load i8, ptr [[TMP6]], align 1
-; X32-NEXT:    [[TMP9:%.*]] = load i8, ptr [[TMP7]], align 1
-; X32-NEXT:    [[TMP10:%.*]] = zext i8 [[TMP8]] to i16
-; X32-NEXT:    [[TMP11:%.*]] = zext i8 [[TMP9]] to i16
-; X32-NEXT:    [[TMP12:%.*]] = xor i16 [[TMP10]], [[TMP11]]
-; X32-NEXT:    [[TMP13:%.*]] = or i16 [[TMP5]], [[TMP12]]
-; X32-NEXT:    [[TMP14:%.*]] = icmp ne i16 [[TMP13]], 0
-; X32-NEXT:    [[TMP15:%.*]] = zext i1 [[TMP14]] to i32
-; X32-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP15]], 0
+; X32-LABEL: define i32 @cmp_eq3(
+; X32-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X32-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X32-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X32-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; X32-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X32-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X32-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X32-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X32-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; X32-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; X32-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; X32-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; X32-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; X32-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X32-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
 ; X32-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X32-NEXT:    ret i32 [[CONV]]
 ;
@@ -328,12 +345,13 @@ define i32 @cmp_eq3(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp_eq4(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X32-LABEL: @cmp_eq4(
-; X32-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X:%.*]], align 1
-; X32-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y:%.*]], align 1
-; X32-NEXT:    [[TMP5:%.*]] = icmp ne i32 [[TMP3]], [[TMP4]]
-; X32-NEXT:    [[TMP6:%.*]] = zext i1 [[TMP5]] to i32
-; X32-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP6]], 0
+; X32-LABEL: define i32 @cmp_eq4(
+; X32-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X32-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X32-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X32-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X32-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X32-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP4]], 0
 ; X32-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X32-NEXT:    ret i32 [[CONV]]
 ;
@@ -344,21 +362,22 @@ define i32 @cmp_eq4(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp_eq5(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X32-LABEL: @cmp_eq5(
-; X32-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X:%.*]], align 1
-; X32-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y:%.*]], align 1
-; X32-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
-; X32-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 4
-; X32-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 4
-; X32-NEXT:    [[TMP8:%.*]] = load i8, ptr [[TMP6]], align 1
-; X32-NEXT:    [[TMP9:%.*]] = load i8, ptr [[TMP7]], align 1
-; X32-NEXT:    [[TMP10:%.*]] = zext i8 [[TMP8]] to i32
-; X32-NEXT:    [[TMP11:%.*]] = zext i8 [[TMP9]] to i32
-; X32-NEXT:    [[TMP12:%.*]] = xor i32 [[TMP10]], [[TMP11]]
-; X32-NEXT:    [[TMP13:%.*]] = or i32 [[TMP5]], [[TMP12]]
-; X32-NEXT:    [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0
-; X32-NEXT:    [[TMP15:%.*]] = zext i1 [[TMP14]] to i32
-; X32-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP15]], 0
+; X32-LABEL: define i32 @cmp_eq5(
+; X32-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X32-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X32-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X32-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X32-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X32-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X32-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X32-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X32-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; X32-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; X32-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X32-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X32-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X32-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X32-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
 ; X32-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X32-NEXT:    ret i32 [[CONV]]
 ;
@@ -369,21 +388,22 @@ define i32 @cmp_eq5(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp_eq6(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X32-LABEL: @cmp_eq6(
-; X32-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X:%.*]], align 1
-; X32-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y:%.*]], align 1
-; X32-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
-; X32-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 4
-; X32-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 4
-; X32-NEXT:    [[TMP10:%.*]] = load i16, ptr [[TMP6]], align 1
-; X32-NEXT:    [[TMP11:%.*]] = load i16, ptr [[TMP7]], align 1
-; X32-NEXT:    [[TMP12:%.*]] = zext i16 [[TMP10]] to i32
-; X32-NEXT:    [[TMP13:%.*]] = zext i16 [[TMP11]] to i32
-; X32-NEXT:    [[TMP14:%.*]] = xor i32 [[TMP12]], [[TMP13]]
-; X32-NEXT:    [[TMP15:%.*]] = or i32 [[TMP5]], [[TMP14]]
-; X32-NEXT:    [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0
-; X32-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
-; X32-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP17]], 0
+; X32-LABEL: define i32 @cmp_eq6(
+; X32-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X32-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X32-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X32-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X32-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X32-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X32-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP4]], align 1
+; X32-NEXT:    [[TMP7:%.*]] = load i16, ptr [[TMP5]], align 1
+; X32-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP6]] to i32
+; X32-NEXT:    [[TMP9:%.*]] = zext i16 [[TMP7]] to i32
+; X32-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X32-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X32-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X32-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X32-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
 ; X32-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X32-NEXT:    ret i32 [[CONV]]
 ;
@@ -394,21 +414,22 @@ define i32 @cmp_eq6(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp_eq6_align4(ptr nocapture readonly align 4 %x, ptr nocapture readonly align 4 %y)  {
-; X32-LABEL: @cmp_eq6_align4(
-; X32-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X:%.*]], align 4
-; X32-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y:%.*]], align 4
-; X32-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
-; X32-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 4
-; X32-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 4
-; X32-NEXT:    [[TMP10:%.*]] = load i16, ptr [[TMP6]], align 4
-; X32-NEXT:    [[TMP11:%.*]] = load i16, ptr [[TMP7]], align 4
-; X32-NEXT:    [[TMP12:%.*]] = zext i16 [[TMP10]] to i32
-; X32-NEXT:    [[TMP13:%.*]] = zext i16 [[TMP11]] to i32
-; X32-NEXT:    [[TMP14:%.*]] = xor i32 [[TMP12]], [[TMP13]]
-; X32-NEXT:    [[TMP15:%.*]] = or i32 [[TMP5]], [[TMP14]]
-; X32-NEXT:    [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0
-; X32-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
-; X32-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP17]], 0
+; X32-LABEL: define i32 @cmp_eq6_align4(
+; X32-SAME: ptr nocapture readonly align 4 [[X:%.*]], ptr nocapture readonly align 4 [[Y:%.*]]) {
+; X32-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 4
+; X32-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 4
+; X32-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X32-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X32-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X32-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP4]], align 4
+; X32-NEXT:    [[TMP7:%.*]] = load i16, ptr [[TMP5]], align 4
+; X32-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP6]] to i32
+; X32-NEXT:    [[TMP9:%.*]] = zext i16 [[TMP7]] to i32
+; X32-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X32-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X32-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X32-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X32-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
 ; X32-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X32-NEXT:    ret i32 [[CONV]]
 ;
@@ -419,19 +440,20 @@ define i32 @cmp_eq6_align4(ptr nocapture readonly align 4 %x, ptr nocapture read
 }
 
 define i32 @cmp_eq7(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X32-LABEL: @cmp_eq7(
-; X32-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X:%.*]], align 1
-; X32-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y:%.*]], align 1
-; X32-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
-; X32-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 3
-; X32-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 3
-; X32-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP6]], align 1
-; X32-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP7]], align 1
-; X32-NEXT:    [[TMP12:%.*]] = xor i32 [[TMP10]], [[TMP11]]
-; X32-NEXT:    [[TMP13:%.*]] = or i32 [[TMP5]], [[TMP12]]
-; X32-NEXT:    [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0
-; X32-NEXT:    [[TMP15:%.*]] = zext i1 [[TMP14]] to i32
-; X32-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP15]], 0
+; X32-LABEL: define i32 @cmp_eq7(
+; X32-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X32-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X32-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X32-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X32-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X32-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X32-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X32-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X32-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X32-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X32-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X32-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X32-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
 ; X32-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X32-NEXT:    ret i32 [[CONV]]
 ;
@@ -442,19 +464,20 @@ define i32 @cmp_eq7(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp_eq8(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X32-LABEL: @cmp_eq8(
-; X32-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X:%.*]], align 1
-; X32-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y:%.*]], align 1
-; X32-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
-; X32-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 4
-; X32-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 4
-; X32-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP6]], align 1
-; X32-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP7]], align 1
-; X32-NEXT:    [[TMP12:%.*]] = xor i32 [[TMP10]], [[TMP11]]
-; X32-NEXT:    [[TMP13:%.*]] = or i32 [[TMP5]], [[TMP12]]
-; X32-NEXT:    [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0
-; X32-NEXT:    [[TMP15:%.*]] = zext i1 [[TMP14]] to i32
-; X32-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP15]], 0
+; X32-LABEL: define i32 @cmp_eq8(
+; X32-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X32-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X32-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X32-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X32-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X32-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X32-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X32-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X32-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X32-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X32-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X32-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X32-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
 ; X32-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X32-NEXT:    ret i32 [[CONV]]
 ;
@@ -465,8 +488,9 @@ define i32 @cmp_eq8(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp_eq9(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X32-LABEL: @cmp_eq9(
-; X32-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X:%.*]], ptr [[Y:%.*]], i32 9)
+; X32-LABEL: define i32 @cmp_eq9(
+; X32-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X32-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 9)
 ; X32-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
 ; X32-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X32-NEXT:    ret i32 [[CONV]]
@@ -478,8 +502,9 @@ define i32 @cmp_eq9(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp_eq10(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X32-LABEL: @cmp_eq10(
-; X32-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X:%.*]], ptr [[Y:%.*]], i32 10)
+; X32-LABEL: define i32 @cmp_eq10(
+; X32-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X32-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 10)
 ; X32-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
 ; X32-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X32-NEXT:    ret i32 [[CONV]]
@@ -491,8 +516,9 @@ define i32 @cmp_eq10(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp_eq11(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X32-LABEL: @cmp_eq11(
-; X32-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X:%.*]], ptr [[Y:%.*]], i32 11)
+; X32-LABEL: define i32 @cmp_eq11(
+; X32-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X32-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 11)
 ; X32-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
 ; X32-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X32-NEXT:    ret i32 [[CONV]]
@@ -504,8 +530,9 @@ define i32 @cmp_eq11(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp_eq12(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X32-LABEL: @cmp_eq12(
-; X32-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X:%.*]], ptr [[Y:%.*]], i32 12)
+; X32-LABEL: define i32 @cmp_eq12(
+; X32-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X32-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 12)
 ; X32-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
 ; X32-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X32-NEXT:    ret i32 [[CONV]]
@@ -517,8 +544,9 @@ define i32 @cmp_eq12(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp_eq13(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X32-LABEL: @cmp_eq13(
-; X32-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X:%.*]], ptr [[Y:%.*]], i32 13)
+; X32-LABEL: define i32 @cmp_eq13(
+; X32-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X32-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 13)
 ; X32-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
 ; X32-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X32-NEXT:    ret i32 [[CONV]]
@@ -530,8 +558,9 @@ define i32 @cmp_eq13(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp_eq14(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X32-LABEL: @cmp_eq14(
-; X32-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X:%.*]], ptr [[Y:%.*]], i32 14)
+; X32-LABEL: define i32 @cmp_eq14(
+; X32-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X32-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 14)
 ; X32-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
 ; X32-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X32-NEXT:    ret i32 [[CONV]]
@@ -543,8 +572,9 @@ define i32 @cmp_eq14(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp_eq15(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X32-LABEL: @cmp_eq15(
-; X32-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X:%.*]], ptr [[Y:%.*]], i32 15)
+; X32-LABEL: define i32 @cmp_eq15(
+; X32-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X32-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 15)
 ; X32-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
 ; X32-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X32-NEXT:    ret i32 [[CONV]]
@@ -556,8 +586,9 @@ define i32 @cmp_eq15(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp_eq16(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X32-LABEL: @cmp_eq16(
-; X32-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X:%.*]], ptr [[Y:%.*]], i32 16)
+; X32-LABEL: define i32 @cmp_eq16(
+; X32-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X32-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i32 16)
 ; X32-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], 0
 ; X32-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X32-NEXT:    ret i32 [[CONV]]
diff --git a/llvm/test/Transforms/ExpandMemCmp/X86/memcmp.ll b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp.ll
index f686e29975564f..99100aad3ee84a 100644
--- a/llvm/test/Transforms/ExpandMemCmp/X86/memcmp.ll
+++ b/llvm/test/Transforms/ExpandMemCmp/X86/memcmp.ll
@@ -1,66 +1,67 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -expand-memcmp -memcmp-num-loads-per-block=1 -mtriple=x86_64-unknown-unknown -data-layout=e-m:o-i64:64-f80:128-n8:16:32:64-S128         < %s | FileCheck %s --check-prefix=X64 --check-prefix=X64_1LD
-; RUN: opt -S -expand-memcmp -memcmp-num-loads-per-block=2 -mtriple=x86_64-unknown-unknown -data-layout=e-m:o-i64:64-f80:128-n8:16:32:64-S128         < %s | FileCheck %s --check-prefix=X64 --check-prefix=X64_2LD
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
 ; RUN: opt -S -passes=expand-memcmp -memcmp-num-loads-per-block=1 -mtriple=x86_64-unknown-unknown -data-layout=e-m:o-i64:64-f80:128-n8:16:32:64-S128         < %s | FileCheck %s --check-prefix=X64 --check-prefix=X64_1LD
 ; RUN: opt -S -passes=expand-memcmp -memcmp-num-loads-per-block=2 -mtriple=x86_64-unknown-unknown -data-layout=e-m:o-i64:64-f80:128-n8:16:32:64-S128         < %s | FileCheck %s --check-prefix=X64 --check-prefix=X64_2LD
 
 declare i32 @memcmp(ptr nocapture, ptr nocapture, i64)
 
 define i32 @cmp2(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X64-LABEL: @cmp2(
-; X64-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X:%.*]], align 1
-; X64-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y:%.*]], align 1
-; X64-NEXT:    [[TMP5:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
-; X64-NEXT:    [[TMP6:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
-; X64-NEXT:    [[TMP7:%.*]] = zext i16 [[TMP5]] to i32
-; X64-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP6]] to i32
-; X64-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
-; X64-NEXT:    ret i32 [[TMP9]]
+; X64-LABEL: define i32 @cmp2(
+; X64-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X64-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-NEXT:    ret i32 [[TMP7]]
 ;
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 2)
   ret i32 %call
 }
 
 define i32 @cmp2_align2(ptr nocapture readonly align 2 %x, ptr nocapture readonly align 2 %y)  {
-; X64-LABEL: @cmp2_align2(
-; X64-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X:%.*]], align 2
-; X64-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y:%.*]], align 2
-; X64-NEXT:    [[TMP5:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
-; X64-NEXT:    [[TMP6:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
-; X64-NEXT:    [[TMP7:%.*]] = zext i16 [[TMP5]] to i32
-; X64-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP6]] to i32
-; X64-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
-; X64-NEXT:    ret i32 [[TMP9]]
+; X64-LABEL: define i32 @cmp2_align2(
+; X64-SAME: ptr nocapture readonly align 2 [[X:%.*]], ptr nocapture readonly align 2 [[Y:%.*]]) {
+; X64-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 2
+; X64-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 2
+; X64-NEXT:    [[TMP3:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; X64-NEXT:    [[TMP4:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; X64-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; X64-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; X64-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; X64-NEXT:    ret i32 [[TMP7]]
 ;
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 2)
   ret i32 %call
 }
 
 define i32 @cmp3(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X64-LABEL: @cmp3(
+; X64-LABEL: define i32 @cmp3(
+; X64-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
 ; X64-NEXT:    br label [[LOADBB:%.*]]
 ; X64:       res_block:
-; X64-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP7:%.*]], [[TMP8:%.*]]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i16 [[TMP5:%.*]], [[TMP6:%.*]]
 ; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
 ; X64-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X64:       loadbb:
-; X64-NEXT:    [[TMP5:%.*]] = load i16, ptr [[X:%.*]], align 1
-; X64-NEXT:    [[TMP6:%.*]] = load i16, ptr [[Y:%.*]], align 1
-; X64-NEXT:    [[TMP7]] = call i16 @llvm.bswap.i16(i16 [[TMP5]])
-; X64-NEXT:    [[TMP8]] = call i16 @llvm.bswap.i16(i16 [[TMP6]])
-; X64-NEXT:    [[TMP9:%.*]] = icmp eq i16 [[TMP7]], [[TMP8]]
-; X64-NEXT:    br i1 [[TMP9]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i16 @llvm.bswap.i16(i16 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i16 @llvm.bswap.i16(i16 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i16 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
 ; X64:       loadbb1:
-; X64-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[X]], i64 2
-; X64-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[Y]], i64 2
-; X64-NEXT:    [[TMP12:%.*]] = load i8, ptr [[TMP10]], align 1
-; X64-NEXT:    [[TMP13:%.*]] = load i8, ptr [[TMP11]], align 1
-; X64-NEXT:    [[TMP14:%.*]] = zext i8 [[TMP12]] to i32
-; X64-NEXT:    [[TMP15:%.*]] = zext i8 [[TMP13]] to i32
-; X64-NEXT:    [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]]
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
 ; X64-NEXT:    br label [[ENDBLOCK]]
 ; X64:       endblock:
-; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP16]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
 ; X64-NEXT:    ret i32 [[PHI_RES]]
 ;
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 3)
@@ -68,47 +69,49 @@ define i32 @cmp3(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp4(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X64-LABEL: @cmp4(
-; X64-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X:%.*]], align 1
-; X64-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y:%.*]], align 1
-; X64-NEXT:    [[TMP5:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
-; X64-NEXT:    [[TMP6:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
-; X64-NEXT:    [[TMP7:%.*]] = icmp ugt i32 [[TMP5]], [[TMP6]]
-; X64-NEXT:    [[TMP8:%.*]] = icmp ult i32 [[TMP5]], [[TMP6]]
-; X64-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP7]] to i32
-; X64-NEXT:    [[TMP10:%.*]] = zext i1 [[TMP8]] to i32
-; X64-NEXT:    [[TMP11:%.*]] = sub i32 [[TMP9]], [[TMP10]]
-; X64-NEXT:    ret i32 [[TMP11]]
+; X64-LABEL: define i32 @cmp4(
+; X64-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X64-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; X64-NEXT:    [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; X64-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; X64-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; X64-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-NEXT:    ret i32 [[TMP9]]
 ;
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 4)
   ret i32 %call
 }
 
 define i32 @cmp5(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X64-LABEL: @cmp5(
+; X64-LABEL: define i32 @cmp5(
+; X64-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
 ; X64-NEXT:    br label [[LOADBB:%.*]]
 ; X64:       res_block:
-; X64-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP7:%.*]], [[TMP8:%.*]]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[TMP5:%.*]], [[TMP6:%.*]]
 ; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
 ; X64-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X64:       loadbb:
-; X64-NEXT:    [[TMP5:%.*]] = load i32, ptr [[X:%.*]], align 1
-; X64-NEXT:    [[TMP6:%.*]] = load i32, ptr [[Y:%.*]], align 1
-; X64-NEXT:    [[TMP7]] = call i32 @llvm.bswap.i32(i32 [[TMP5]])
-; X64-NEXT:    [[TMP8]] = call i32 @llvm.bswap.i32(i32 [[TMP6]])
-; X64-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP7]], [[TMP8]]
-; X64-NEXT:    br i1 [[TMP9]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
 ; X64:       loadbb1:
-; X64-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[X]], i64 4
-; X64-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[Y]], i64 4
-; X64-NEXT:    [[TMP12:%.*]] = load i8, ptr [[TMP10]], align 1
-; X64-NEXT:    [[TMP13:%.*]] = load i8, ptr [[TMP11]], align 1
-; X64-NEXT:    [[TMP14:%.*]] = zext i8 [[TMP12]] to i32
-; X64-NEXT:    [[TMP15:%.*]] = zext i8 [[TMP13]] to i32
-; X64-NEXT:    [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]]
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
 ; X64-NEXT:    br label [[ENDBLOCK]]
 ; X64:       endblock:
-; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP16]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
 ; X64-NEXT:    ret i32 [[PHI_RES]]
 ;
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 5)
@@ -116,32 +119,33 @@ define i32 @cmp5(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp6(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X64-LABEL: @cmp6(
+; X64-LABEL: define i32 @cmp6(
+; X64-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
 ; X64-NEXT:    br label [[LOADBB:%.*]]
 ; X64:       res_block:
-; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1:%.*]] ]
-; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP19:%.*]], [[LOADBB1]] ]
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP14:%.*]], [[LOADBB1:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP15:%.*]], [[LOADBB1]] ]
 ; X64-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
 ; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
 ; X64-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X64:       loadbb:
-; X64-NEXT:    [[TMP5:%.*]] = load i32, ptr [[X:%.*]], align 1
-; X64-NEXT:    [[TMP6:%.*]] = load i32, ptr [[Y:%.*]], align 1
-; X64-NEXT:    [[TMP7]] = call i32 @llvm.bswap.i32(i32 [[TMP5]])
-; X64-NEXT:    [[TMP8]] = call i32 @llvm.bswap.i32(i32 [[TMP6]])
-; X64-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP7]], [[TMP8]]
-; X64-NEXT:    br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
 ; X64:       loadbb1:
-; X64-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[X]], i64 4
-; X64-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[Y]], i64 4
-; X64-NEXT:    [[TMP14:%.*]] = load i16, ptr [[TMP10]], align 1
-; X64-NEXT:    [[TMP15:%.*]] = load i16, ptr [[TMP11]], align 1
-; X64-NEXT:    [[TMP16:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP14]])
-; X64-NEXT:    [[TMP17:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP15]])
-; X64-NEXT:    [[TMP18]] = zext i16 [[TMP16]] to i32
-; X64-NEXT:    [[TMP19]] = zext i16 [[TMP17]] to i32
-; X64-NEXT:    [[TMP20:%.*]] = icmp eq i32 [[TMP18]], [[TMP19]]
-; X64-NEXT:    br i1 [[TMP20]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64-NEXT:    [[TMP10:%.*]] = load i16, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i16, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP10]])
+; X64-NEXT:    [[TMP13:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP11]])
+; X64-NEXT:    [[TMP14]] = zext i16 [[TMP12]] to i32
+; X64-NEXT:    [[TMP15]] = zext i16 [[TMP13]] to i32
+; X64-NEXT:    [[TMP16:%.*]] = icmp eq i32 [[TMP14]], [[TMP15]]
+; X64-NEXT:    br i1 [[TMP16]], label [[ENDBLOCK]], label [[RES_BLOCK]]
 ; X64:       endblock:
 ; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
 ; X64-NEXT:    ret i32 [[PHI_RES]]
@@ -151,30 +155,31 @@ define i32 @cmp6(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp7(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X64-LABEL: @cmp7(
+; X64-LABEL: define i32 @cmp7(
+; X64-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
 ; X64-NEXT:    br label [[LOADBB:%.*]]
 ; X64:       res_block:
-; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1:%.*]] ]
-; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP17:%.*]], [[LOADBB1]] ]
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i32 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i32 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
 ; X64-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[PHI_SRC1]], [[PHI_SRC2]]
 ; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
 ; X64-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X64:       loadbb:
-; X64-NEXT:    [[TMP5:%.*]] = load i32, ptr [[X:%.*]], align 1
-; X64-NEXT:    [[TMP6:%.*]] = load i32, ptr [[Y:%.*]], align 1
-; X64-NEXT:    [[TMP7]] = call i32 @llvm.bswap.i32(i32 [[TMP5]])
-; X64-NEXT:    [[TMP8]] = call i32 @llvm.bswap.i32(i32 [[TMP6]])
-; X64-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP7]], [[TMP8]]
-; X64-NEXT:    br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i32 @llvm.bswap.i32(i32 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
 ; X64:       loadbb1:
-; X64-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[X]], i64 3
-; X64-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[Y]], i64 3
-; X64-NEXT:    [[TMP14:%.*]] = load i32, ptr [[TMP10]], align 1
-; X64-NEXT:    [[TMP15:%.*]] = load i32, ptr [[TMP11]], align 1
-; X64-NEXT:    [[TMP16]] = call i32 @llvm.bswap.i32(i32 [[TMP14]])
-; X64-NEXT:    [[TMP17]] = call i32 @llvm.bswap.i32(i32 [[TMP15]])
-; X64-NEXT:    [[TMP18:%.*]] = icmp eq i32 [[TMP16]], [[TMP17]]
-; X64-NEXT:    br i1 [[TMP18]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-NEXT:    [[TMP13]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
 ; X64:       endblock:
 ; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
 ; X64-NEXT:    ret i32 [[PHI_RES]]
@@ -184,47 +189,49 @@ define i32 @cmp7(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp8(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X64-LABEL: @cmp8(
-; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X:%.*]], align 1
-; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y:%.*]], align 1
-; X64-NEXT:    [[TMP5:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
-; X64-NEXT:    [[TMP6:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
-; X64-NEXT:    [[TMP7:%.*]] = icmp ugt i64 [[TMP5]], [[TMP6]]
-; X64-NEXT:    [[TMP8:%.*]] = icmp ult i64 [[TMP5]], [[TMP6]]
-; X64-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP7]] to i32
-; X64-NEXT:    [[TMP10:%.*]] = zext i1 [[TMP8]] to i32
-; X64-NEXT:    [[TMP11:%.*]] = sub i32 [[TMP9]], [[TMP10]]
-; X64-NEXT:    ret i32 [[TMP11]]
+; X64-LABEL: define i32 @cmp8(
+; X64-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X64-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP1]])
+; X64-NEXT:    [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP2]])
+; X64-NEXT:    [[TMP5:%.*]] = icmp ugt i64 [[TMP3]], [[TMP4]]
+; X64-NEXT:    [[TMP6:%.*]] = icmp ult i64 [[TMP3]], [[TMP4]]
+; X64-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; X64-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; X64-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; X64-NEXT:    ret i32 [[TMP9]]
 ;
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 8)
   ret i32 %call
 }
 
 define i32 @cmp9(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X64-LABEL: @cmp9(
+; X64-LABEL: define i32 @cmp9(
+; X64-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
 ; X64-NEXT:    br label [[LOADBB:%.*]]
 ; X64:       res_block:
-; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[TMP7:%.*]], [[TMP8:%.*]]
+; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[TMP5:%.*]], [[TMP6:%.*]]
 ; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
 ; X64-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X64:       loadbb:
-; X64-NEXT:    [[TMP5:%.*]] = load i64, ptr [[X:%.*]], align 1
-; X64-NEXT:    [[TMP6:%.*]] = load i64, ptr [[Y:%.*]], align 1
-; X64-NEXT:    [[TMP7]] = call i64 @llvm.bswap.i64(i64 [[TMP5]])
-; X64-NEXT:    [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP6]])
-; X64-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP7]], [[TMP8]]
-; X64-NEXT:    br i1 [[TMP9]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
 ; X64:       loadbb1:
-; X64-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[X]], i64 8
-; X64-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[Y]], i64 8
-; X64-NEXT:    [[TMP12:%.*]] = load i8, ptr [[TMP10]], align 1
-; X64-NEXT:    [[TMP13:%.*]] = load i8, ptr [[TMP11]], align 1
-; X64-NEXT:    [[TMP14:%.*]] = zext i8 [[TMP12]] to i32
-; X64-NEXT:    [[TMP15:%.*]] = zext i8 [[TMP13]] to i32
-; X64-NEXT:    [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]]
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i8, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; X64-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP11]] to i32
+; X64-NEXT:    [[TMP14:%.*]] = sub i32 [[TMP12]], [[TMP13]]
 ; X64-NEXT:    br label [[ENDBLOCK]]
 ; X64:       endblock:
-; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP16]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
+; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP14]], [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
 ; X64-NEXT:    ret i32 [[PHI_RES]]
 ;
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 9)
@@ -232,32 +239,33 @@ define i32 @cmp9(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp10(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X64-LABEL: @cmp10(
+; X64-LABEL: define i32 @cmp10(
+; X64-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
 ; X64-NEXT:    br label [[LOADBB:%.*]]
 ; X64:       res_block:
-; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1:%.*]] ]
-; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP19:%.*]], [[LOADBB1]] ]
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP14:%.*]], [[LOADBB1:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP15:%.*]], [[LOADBB1]] ]
 ; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
 ; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
 ; X64-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X64:       loadbb:
-; X64-NEXT:    [[TMP5:%.*]] = load i64, ptr [[X:%.*]], align 1
-; X64-NEXT:    [[TMP6:%.*]] = load i64, ptr [[Y:%.*]], align 1
-; X64-NEXT:    [[TMP7]] = call i64 @llvm.bswap.i64(i64 [[TMP5]])
-; X64-NEXT:    [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP6]])
-; X64-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP7]], [[TMP8]]
-; X64-NEXT:    br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
 ; X64:       loadbb1:
-; X64-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[X]], i64 8
-; X64-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[Y]], i64 8
-; X64-NEXT:    [[TMP14:%.*]] = load i16, ptr [[TMP10]], align 1
-; X64-NEXT:    [[TMP15:%.*]] = load i16, ptr [[TMP11]], align 1
-; X64-NEXT:    [[TMP16:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP14]])
-; X64-NEXT:    [[TMP17:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP15]])
-; X64-NEXT:    [[TMP18]] = zext i16 [[TMP16]] to i64
-; X64-NEXT:    [[TMP19]] = zext i16 [[TMP17]] to i64
-; X64-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[TMP18]], [[TMP19]]
-; X64-NEXT:    br i1 [[TMP20]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP10:%.*]] = load i16, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i16, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP10]])
+; X64-NEXT:    [[TMP13:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP11]])
+; X64-NEXT:    [[TMP14]] = zext i16 [[TMP12]] to i64
+; X64-NEXT:    [[TMP15]] = zext i16 [[TMP13]] to i64
+; X64-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[TMP14]], [[TMP15]]
+; X64-NEXT:    br i1 [[TMP16]], label [[ENDBLOCK]], label [[RES_BLOCK]]
 ; X64:       endblock:
 ; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
 ; X64-NEXT:    ret i32 [[PHI_RES]]
@@ -267,30 +275,31 @@ define i32 @cmp10(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp11(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X64-LABEL: @cmp11(
+; X64-LABEL: define i32 @cmp11(
+; X64-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
 ; X64-NEXT:    br label [[LOADBB:%.*]]
 ; X64:       res_block:
-; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1:%.*]] ]
-; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP17:%.*]], [[LOADBB1]] ]
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
 ; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
 ; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
 ; X64-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X64:       loadbb:
-; X64-NEXT:    [[TMP5:%.*]] = load i64, ptr [[X:%.*]], align 1
-; X64-NEXT:    [[TMP6:%.*]] = load i64, ptr [[Y:%.*]], align 1
-; X64-NEXT:    [[TMP7]] = call i64 @llvm.bswap.i64(i64 [[TMP5]])
-; X64-NEXT:    [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP6]])
-; X64-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP7]], [[TMP8]]
-; X64-NEXT:    br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
 ; X64:       loadbb1:
-; X64-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[X]], i64 3
-; X64-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[Y]], i64 3
-; X64-NEXT:    [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 1
-; X64-NEXT:    [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 1
-; X64-NEXT:    [[TMP16]] = call i64 @llvm.bswap.i64(i64 [[TMP14]])
-; X64-NEXT:    [[TMP17]] = call i64 @llvm.bswap.i64(i64 [[TMP15]])
-; X64-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[TMP16]], [[TMP17]]
-; X64-NEXT:    br i1 [[TMP18]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
 ; X64:       endblock:
 ; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
 ; X64-NEXT:    ret i32 [[PHI_RES]]
@@ -300,32 +309,33 @@ define i32 @cmp11(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp12(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X64-LABEL: @cmp12(
+; X64-LABEL: define i32 @cmp12(
+; X64-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
 ; X64-NEXT:    br label [[LOADBB:%.*]]
 ; X64:       res_block:
-; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP18:%.*]], [[LOADBB1:%.*]] ]
-; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP19:%.*]], [[LOADBB1]] ]
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP14:%.*]], [[LOADBB1:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP15:%.*]], [[LOADBB1]] ]
 ; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
 ; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
 ; X64-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X64:       loadbb:
-; X64-NEXT:    [[TMP5:%.*]] = load i64, ptr [[X:%.*]], align 1
-; X64-NEXT:    [[TMP6:%.*]] = load i64, ptr [[Y:%.*]], align 1
-; X64-NEXT:    [[TMP7]] = call i64 @llvm.bswap.i64(i64 [[TMP5]])
-; X64-NEXT:    [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP6]])
-; X64-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP7]], [[TMP8]]
-; X64-NEXT:    br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
 ; X64:       loadbb1:
-; X64-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[X]], i64 8
-; X64-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[Y]], i64 8
-; X64-NEXT:    [[TMP14:%.*]] = load i32, ptr [[TMP10]], align 1
-; X64-NEXT:    [[TMP15:%.*]] = load i32, ptr [[TMP11]], align 1
-; X64-NEXT:    [[TMP16:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP14]])
-; X64-NEXT:    [[TMP17:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP15]])
-; X64-NEXT:    [[TMP18]] = zext i32 [[TMP16]] to i64
-; X64-NEXT:    [[TMP19]] = zext i32 [[TMP17]] to i64
-; X64-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[TMP18]], [[TMP19]]
-; X64-NEXT:    br i1 [[TMP20]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; X64-NEXT:    [[TMP13:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP11]])
+; X64-NEXT:    [[TMP14]] = zext i32 [[TMP12]] to i64
+; X64-NEXT:    [[TMP15]] = zext i32 [[TMP13]] to i64
+; X64-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[TMP14]], [[TMP15]]
+; X64-NEXT:    br i1 [[TMP16]], label [[ENDBLOCK]], label [[RES_BLOCK]]
 ; X64:       endblock:
 ; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
 ; X64-NEXT:    ret i32 [[PHI_RES]]
@@ -335,30 +345,31 @@ define i32 @cmp12(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp13(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X64-LABEL: @cmp13(
+; X64-LABEL: define i32 @cmp13(
+; X64-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
 ; X64-NEXT:    br label [[LOADBB:%.*]]
 ; X64:       res_block:
-; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1:%.*]] ]
-; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP17:%.*]], [[LOADBB1]] ]
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
 ; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
 ; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
 ; X64-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X64:       loadbb:
-; X64-NEXT:    [[TMP5:%.*]] = load i64, ptr [[X:%.*]], align 1
-; X64-NEXT:    [[TMP6:%.*]] = load i64, ptr [[Y:%.*]], align 1
-; X64-NEXT:    [[TMP7]] = call i64 @llvm.bswap.i64(i64 [[TMP5]])
-; X64-NEXT:    [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP6]])
-; X64-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP7]], [[TMP8]]
-; X64-NEXT:    br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
 ; X64:       loadbb1:
-; X64-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[X]], i64 5
-; X64-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[Y]], i64 5
-; X64-NEXT:    [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 1
-; X64-NEXT:    [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 1
-; X64-NEXT:    [[TMP16]] = call i64 @llvm.bswap.i64(i64 [[TMP14]])
-; X64-NEXT:    [[TMP17]] = call i64 @llvm.bswap.i64(i64 [[TMP15]])
-; X64-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[TMP16]], [[TMP17]]
-; X64-NEXT:    br i1 [[TMP18]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 5
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 5
+; X64-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
 ; X64:       endblock:
 ; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
 ; X64-NEXT:    ret i32 [[PHI_RES]]
@@ -368,30 +379,31 @@ define i32 @cmp13(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp14(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X64-LABEL: @cmp14(
+; X64-LABEL: define i32 @cmp14(
+; X64-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
 ; X64-NEXT:    br label [[LOADBB:%.*]]
 ; X64:       res_block:
-; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1:%.*]] ]
-; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP17:%.*]], [[LOADBB1]] ]
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
 ; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
 ; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
 ; X64-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X64:       loadbb:
-; X64-NEXT:    [[TMP5:%.*]] = load i64, ptr [[X:%.*]], align 1
-; X64-NEXT:    [[TMP6:%.*]] = load i64, ptr [[Y:%.*]], align 1
-; X64-NEXT:    [[TMP7]] = call i64 @llvm.bswap.i64(i64 [[TMP5]])
-; X64-NEXT:    [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP6]])
-; X64-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP7]], [[TMP8]]
-; X64-NEXT:    br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
 ; X64:       loadbb1:
-; X64-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[X]], i64 6
-; X64-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[Y]], i64 6
-; X64-NEXT:    [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 1
-; X64-NEXT:    [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 1
-; X64-NEXT:    [[TMP16]] = call i64 @llvm.bswap.i64(i64 [[TMP14]])
-; X64-NEXT:    [[TMP17]] = call i64 @llvm.bswap.i64(i64 [[TMP15]])
-; X64-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[TMP16]], [[TMP17]]
-; X64-NEXT:    br i1 [[TMP18]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 6
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 6
+; X64-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
 ; X64:       endblock:
 ; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
 ; X64-NEXT:    ret i32 [[PHI_RES]]
@@ -401,30 +413,31 @@ define i32 @cmp14(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp15(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X64-LABEL: @cmp15(
+; X64-LABEL: define i32 @cmp15(
+; X64-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
 ; X64-NEXT:    br label [[LOADBB:%.*]]
 ; X64:       res_block:
-; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1:%.*]] ]
-; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP17:%.*]], [[LOADBB1]] ]
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
 ; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
 ; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
 ; X64-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X64:       loadbb:
-; X64-NEXT:    [[TMP5:%.*]] = load i64, ptr [[X:%.*]], align 1
-; X64-NEXT:    [[TMP6:%.*]] = load i64, ptr [[Y:%.*]], align 1
-; X64-NEXT:    [[TMP7]] = call i64 @llvm.bswap.i64(i64 [[TMP5]])
-; X64-NEXT:    [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP6]])
-; X64-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP7]], [[TMP8]]
-; X64-NEXT:    br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
 ; X64:       loadbb1:
-; X64-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[X]], i64 7
-; X64-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[Y]], i64 7
-; X64-NEXT:    [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 1
-; X64-NEXT:    [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 1
-; X64-NEXT:    [[TMP16]] = call i64 @llvm.bswap.i64(i64 [[TMP14]])
-; X64-NEXT:    [[TMP17]] = call i64 @llvm.bswap.i64(i64 [[TMP15]])
-; X64-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[TMP16]], [[TMP17]]
-; X64-NEXT:    br i1 [[TMP18]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
 ; X64:       endblock:
 ; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
 ; X64-NEXT:    ret i32 [[PHI_RES]]
@@ -434,30 +447,31 @@ define i32 @cmp15(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp16(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X64-LABEL: @cmp16(
+; X64-LABEL: define i32 @cmp16(
+; X64-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
 ; X64-NEXT:    br label [[LOADBB:%.*]]
 ; X64:       res_block:
-; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP7:%.*]], [[LOADBB]] ], [ [[TMP16:%.*]], [[LOADBB1:%.*]] ]
-; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP8:%.*]], [[LOADBB]] ], [ [[TMP17:%.*]], [[LOADBB1]] ]
+; X64-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP5:%.*]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1:%.*]] ]
+; X64-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP6:%.*]], [[LOADBB]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
 ; X64-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
 ; X64-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 -1, i32 1
 ; X64-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X64:       loadbb:
-; X64-NEXT:    [[TMP5:%.*]] = load i64, ptr [[X:%.*]], align 1
-; X64-NEXT:    [[TMP6:%.*]] = load i64, ptr [[Y:%.*]], align 1
-; X64-NEXT:    [[TMP7]] = call i64 @llvm.bswap.i64(i64 [[TMP5]])
-; X64-NEXT:    [[TMP8]] = call i64 @llvm.bswap.i64(i64 [[TMP6]])
-; X64-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[TMP7]], [[TMP8]]
-; X64-NEXT:    br i1 [[TMP9]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
+; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP5]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
+; X64-NEXT:    [[TMP6]] = call i64 @llvm.bswap.i64(i64 [[TMP4]])
+; X64-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; X64-NEXT:    br i1 [[TMP7]], label [[LOADBB1]], label [[RES_BLOCK:%.*]]
 ; X64:       loadbb1:
-; X64-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[X]], i64 8
-; X64-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[Y]], i64 8
-; X64-NEXT:    [[TMP14:%.*]] = load i64, ptr [[TMP10]], align 1
-; X64-NEXT:    [[TMP15:%.*]] = load i64, ptr [[TMP11]], align 1
-; X64-NEXT:    [[TMP16]] = call i64 @llvm.bswap.i64(i64 [[TMP14]])
-; X64-NEXT:    [[TMP17]] = call i64 @llvm.bswap.i64(i64 [[TMP15]])
-; X64-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[TMP16]], [[TMP17]]
-; X64-NEXT:    br i1 [[TMP18]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; X64-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; X64-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP9]], align 1
+; X64-NEXT:    [[TMP12]] = call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; X64-NEXT:    [[TMP13]] = call i64 @llvm.bswap.i64(i64 [[TMP11]])
+; X64-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[TMP12]], [[TMP13]]
+; X64-NEXT:    br i1 [[TMP14]], label [[ENDBLOCK]], label [[RES_BLOCK]]
 ; X64:       endblock:
 ; X64-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP2]], [[RES_BLOCK]] ]
 ; X64-NEXT:    ret i32 [[PHI_RES]]
@@ -467,12 +481,13 @@ define i32 @cmp16(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp_eq2(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X64-LABEL: @cmp_eq2(
-; X64-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X:%.*]], align 1
-; X64-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y:%.*]], align 1
-; X64-NEXT:    [[TMP5:%.*]] = icmp ne i16 [[TMP3]], [[TMP4]]
-; X64-NEXT:    [[TMP6:%.*]] = zext i1 [[TMP5]] to i32
-; X64-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP6]], 0
+; X64-LABEL: define i32 @cmp_eq2(
+; X64-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X64-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP4]], 0
 ; X64-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X64-NEXT:    ret i32 [[CONV]]
 ;
@@ -483,43 +498,45 @@ define i32 @cmp_eq2(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp_eq3(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X64_1LD-LABEL: @cmp_eq3(
+; X64_1LD-LABEL: define i32 @cmp_eq3(
+; X64_1LD-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
 ; X64_1LD-NEXT:    br label [[LOADBB:%.*]]
 ; X64_1LD:       res_block:
 ; X64_1LD-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X64_1LD:       loadbb:
-; X64_1LD-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X:%.*]], align 1
-; X64_1LD-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y:%.*]], align 1
-; X64_1LD-NEXT:    [[TMP5:%.*]] = icmp ne i16 [[TMP3]], [[TMP4]]
-; X64_1LD-NEXT:    br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
+; X64_1LD-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64_1LD-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64_1LD-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; X64_1LD-NEXT:    br i1 [[TMP3]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
 ; X64_1LD:       loadbb1:
-; X64_1LD-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 2
-; X64_1LD-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 2
-; X64_1LD-NEXT:    [[TMP8:%.*]] = load i8, ptr [[TMP6]], align 1
-; X64_1LD-NEXT:    [[TMP9:%.*]] = load i8, ptr [[TMP7]], align 1
-; X64_1LD-NEXT:    [[TMP10:%.*]] = icmp ne i8 [[TMP8]], [[TMP9]]
-; X64_1LD-NEXT:    br i1 [[TMP10]], label [[RES_BLOCK]], label [[ENDBLOCK]]
+; X64_1LD-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64_1LD-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64_1LD-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64_1LD-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64_1LD-NEXT:    [[TMP8:%.*]] = icmp ne i8 [[TMP6]], [[TMP7]]
+; X64_1LD-NEXT:    br i1 [[TMP8]], label [[RES_BLOCK]], label [[ENDBLOCK]]
 ; X64_1LD:       endblock:
 ; X64_1LD-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
 ; X64_1LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0
 ; X64_1LD-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X64_1LD-NEXT:    ret i32 [[CONV]]
 ;
-; X64_2LD-LABEL: @cmp_eq3(
-; X64_2LD-NEXT:    [[TMP3:%.*]] = load i16, ptr [[X:%.*]], align 1
-; X64_2LD-NEXT:    [[TMP4:%.*]] = load i16, ptr [[Y:%.*]], align 1
-; X64_2LD-NEXT:    [[TMP5:%.*]] = xor i16 [[TMP3]], [[TMP4]]
-; X64_2LD-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 2
-; X64_2LD-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 2
-; X64_2LD-NEXT:    [[TMP8:%.*]] = load i8, ptr [[TMP6]], align 1
-; X64_2LD-NEXT:    [[TMP9:%.*]] = load i8, ptr [[TMP7]], align 1
-; X64_2LD-NEXT:    [[TMP10:%.*]] = zext i8 [[TMP8]] to i16
-; X64_2LD-NEXT:    [[TMP11:%.*]] = zext i8 [[TMP9]] to i16
-; X64_2LD-NEXT:    [[TMP12:%.*]] = xor i16 [[TMP10]], [[TMP11]]
-; X64_2LD-NEXT:    [[TMP13:%.*]] = or i16 [[TMP5]], [[TMP12]]
-; X64_2LD-NEXT:    [[TMP14:%.*]] = icmp ne i16 [[TMP13]], 0
-; X64_2LD-NEXT:    [[TMP15:%.*]] = zext i1 [[TMP14]] to i32
-; X64_2LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP15]], 0
+; X64_2LD-LABEL: define i32 @cmp_eq3(
+; X64_2LD-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X64_2LD-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; X64_2LD-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; X64_2LD-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
+; X64_2LD-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; X64_2LD-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; X64_2LD-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64_2LD-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64_2LD-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; X64_2LD-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; X64_2LD-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; X64_2LD-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; X64_2LD-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; X64_2LD-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64_2LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
 ; X64_2LD-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X64_2LD-NEXT:    ret i32 [[CONV]]
 ;
@@ -530,12 +547,13 @@ define i32 @cmp_eq3(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp_eq4(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X64-LABEL: @cmp_eq4(
-; X64-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X:%.*]], align 1
-; X64-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y:%.*]], align 1
-; X64-NEXT:    [[TMP5:%.*]] = icmp ne i32 [[TMP3]], [[TMP4]]
-; X64-NEXT:    [[TMP6:%.*]] = zext i1 [[TMP5]] to i32
-; X64-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP6]], 0
+; X64-LABEL: define i32 @cmp_eq4(
+; X64-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X64-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP4]], 0
 ; X64-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X64-NEXT:    ret i32 [[CONV]]
 ;
@@ -546,43 +564,45 @@ define i32 @cmp_eq4(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp_eq5(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X64_1LD-LABEL: @cmp_eq5(
+; X64_1LD-LABEL: define i32 @cmp_eq5(
+; X64_1LD-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
 ; X64_1LD-NEXT:    br label [[LOADBB:%.*]]
 ; X64_1LD:       res_block:
 ; X64_1LD-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X64_1LD:       loadbb:
-; X64_1LD-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X:%.*]], align 1
-; X64_1LD-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y:%.*]], align 1
-; X64_1LD-NEXT:    [[TMP5:%.*]] = icmp ne i32 [[TMP3]], [[TMP4]]
-; X64_1LD-NEXT:    br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
+; X64_1LD-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64_1LD-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64_1LD-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X64_1LD-NEXT:    br i1 [[TMP3]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
 ; X64_1LD:       loadbb1:
-; X64_1LD-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 4
-; X64_1LD-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 4
-; X64_1LD-NEXT:    [[TMP8:%.*]] = load i8, ptr [[TMP6]], align 1
-; X64_1LD-NEXT:    [[TMP9:%.*]] = load i8, ptr [[TMP7]], align 1
-; X64_1LD-NEXT:    [[TMP10:%.*]] = icmp ne i8 [[TMP8]], [[TMP9]]
-; X64_1LD-NEXT:    br i1 [[TMP10]], label [[RES_BLOCK]], label [[ENDBLOCK]]
+; X64_1LD-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64_1LD-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64_1LD-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64_1LD-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64_1LD-NEXT:    [[TMP8:%.*]] = icmp ne i8 [[TMP6]], [[TMP7]]
+; X64_1LD-NEXT:    br i1 [[TMP8]], label [[RES_BLOCK]], label [[ENDBLOCK]]
 ; X64_1LD:       endblock:
 ; X64_1LD-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
 ; X64_1LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0
 ; X64_1LD-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X64_1LD-NEXT:    ret i32 [[CONV]]
 ;
-; X64_2LD-LABEL: @cmp_eq5(
-; X64_2LD-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X:%.*]], align 1
-; X64_2LD-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y:%.*]], align 1
-; X64_2LD-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
-; X64_2LD-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 4
-; X64_2LD-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 4
-; X64_2LD-NEXT:    [[TMP8:%.*]] = load i8, ptr [[TMP6]], align 1
-; X64_2LD-NEXT:    [[TMP9:%.*]] = load i8, ptr [[TMP7]], align 1
-; X64_2LD-NEXT:    [[TMP10:%.*]] = zext i8 [[TMP8]] to i32
-; X64_2LD-NEXT:    [[TMP11:%.*]] = zext i8 [[TMP9]] to i32
-; X64_2LD-NEXT:    [[TMP12:%.*]] = xor i32 [[TMP10]], [[TMP11]]
-; X64_2LD-NEXT:    [[TMP13:%.*]] = or i32 [[TMP5]], [[TMP12]]
-; X64_2LD-NEXT:    [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0
-; X64_2LD-NEXT:    [[TMP15:%.*]] = zext i1 [[TMP14]] to i32
-; X64_2LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP15]], 0
+; X64_2LD-LABEL: define i32 @cmp_eq5(
+; X64_2LD-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X64_2LD-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64_2LD-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64_2LD-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64_2LD-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64_2LD-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64_2LD-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64_2LD-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64_2LD-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; X64_2LD-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; X64_2LD-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X64_2LD-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X64_2LD-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X64_2LD-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64_2LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
 ; X64_2LD-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X64_2LD-NEXT:    ret i32 [[CONV]]
 ;
@@ -593,43 +613,45 @@ define i32 @cmp_eq5(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp_eq6(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X64_1LD-LABEL: @cmp_eq6(
+; X64_1LD-LABEL: define i32 @cmp_eq6(
+; X64_1LD-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
 ; X64_1LD-NEXT:    br label [[LOADBB:%.*]]
 ; X64_1LD:       res_block:
 ; X64_1LD-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X64_1LD:       loadbb:
-; X64_1LD-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X:%.*]], align 1
-; X64_1LD-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y:%.*]], align 1
-; X64_1LD-NEXT:    [[TMP5:%.*]] = icmp ne i32 [[TMP3]], [[TMP4]]
-; X64_1LD-NEXT:    br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
+; X64_1LD-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64_1LD-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64_1LD-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X64_1LD-NEXT:    br i1 [[TMP3]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
 ; X64_1LD:       loadbb1:
-; X64_1LD-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 4
-; X64_1LD-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 4
-; X64_1LD-NEXT:    [[TMP10:%.*]] = load i16, ptr [[TMP6]], align 1
-; X64_1LD-NEXT:    [[TMP11:%.*]] = load i16, ptr [[TMP7]], align 1
-; X64_1LD-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP10]], [[TMP11]]
-; X64_1LD-NEXT:    br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]]
+; X64_1LD-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64_1LD-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64_1LD-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP4]], align 1
+; X64_1LD-NEXT:    [[TMP7:%.*]] = load i16, ptr [[TMP5]], align 1
+; X64_1LD-NEXT:    [[TMP8:%.*]] = icmp ne i16 [[TMP6]], [[TMP7]]
+; X64_1LD-NEXT:    br i1 [[TMP8]], label [[RES_BLOCK]], label [[ENDBLOCK]]
 ; X64_1LD:       endblock:
 ; X64_1LD-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
 ; X64_1LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0
 ; X64_1LD-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X64_1LD-NEXT:    ret i32 [[CONV]]
 ;
-; X64_2LD-LABEL: @cmp_eq6(
-; X64_2LD-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X:%.*]], align 1
-; X64_2LD-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y:%.*]], align 1
-; X64_2LD-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
-; X64_2LD-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 4
-; X64_2LD-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 4
-; X64_2LD-NEXT:    [[TMP10:%.*]] = load i16, ptr [[TMP6]], align 1
-; X64_2LD-NEXT:    [[TMP11:%.*]] = load i16, ptr [[TMP7]], align 1
-; X64_2LD-NEXT:    [[TMP12:%.*]] = zext i16 [[TMP10]] to i32
-; X64_2LD-NEXT:    [[TMP13:%.*]] = zext i16 [[TMP11]] to i32
-; X64_2LD-NEXT:    [[TMP14:%.*]] = xor i32 [[TMP12]], [[TMP13]]
-; X64_2LD-NEXT:    [[TMP15:%.*]] = or i32 [[TMP5]], [[TMP14]]
-; X64_2LD-NEXT:    [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0
-; X64_2LD-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
-; X64_2LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP17]], 0
+; X64_2LD-LABEL: define i32 @cmp_eq6(
+; X64_2LD-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X64_2LD-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64_2LD-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64_2LD-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64_2LD-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64_2LD-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64_2LD-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP4]], align 1
+; X64_2LD-NEXT:    [[TMP7:%.*]] = load i16, ptr [[TMP5]], align 1
+; X64_2LD-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP6]] to i32
+; X64_2LD-NEXT:    [[TMP9:%.*]] = zext i16 [[TMP7]] to i32
+; X64_2LD-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X64_2LD-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X64_2LD-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X64_2LD-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64_2LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
 ; X64_2LD-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X64_2LD-NEXT:    ret i32 [[CONV]]
 ;
@@ -640,43 +662,45 @@ define i32 @cmp_eq6(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp_eq6_align4(ptr nocapture readonly align 4 %x, ptr nocapture readonly align 4 %y)  {
-; X64_1LD-LABEL: @cmp_eq6_align4(
+; X64_1LD-LABEL: define i32 @cmp_eq6_align4(
+; X64_1LD-SAME: ptr nocapture readonly align 4 [[X:%.*]], ptr nocapture readonly align 4 [[Y:%.*]]) {
 ; X64_1LD-NEXT:    br label [[LOADBB:%.*]]
 ; X64_1LD:       res_block:
 ; X64_1LD-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X64_1LD:       loadbb:
-; X64_1LD-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X:%.*]], align 4
-; X64_1LD-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y:%.*]], align 4
-; X64_1LD-NEXT:    [[TMP5:%.*]] = icmp ne i32 [[TMP3]], [[TMP4]]
-; X64_1LD-NEXT:    br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
+; X64_1LD-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 4
+; X64_1LD-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 4
+; X64_1LD-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X64_1LD-NEXT:    br i1 [[TMP3]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
 ; X64_1LD:       loadbb1:
-; X64_1LD-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 4
-; X64_1LD-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 4
-; X64_1LD-NEXT:    [[TMP10:%.*]] = load i16, ptr [[TMP6]], align 4
-; X64_1LD-NEXT:    [[TMP11:%.*]] = load i16, ptr [[TMP7]], align 4
-; X64_1LD-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP10]], [[TMP11]]
-; X64_1LD-NEXT:    br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]]
+; X64_1LD-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64_1LD-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64_1LD-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP4]], align 4
+; X64_1LD-NEXT:    [[TMP7:%.*]] = load i16, ptr [[TMP5]], align 4
+; X64_1LD-NEXT:    [[TMP8:%.*]] = icmp ne i16 [[TMP6]], [[TMP7]]
+; X64_1LD-NEXT:    br i1 [[TMP8]], label [[RES_BLOCK]], label [[ENDBLOCK]]
 ; X64_1LD:       endblock:
 ; X64_1LD-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
 ; X64_1LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0
 ; X64_1LD-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X64_1LD-NEXT:    ret i32 [[CONV]]
 ;
-; X64_2LD-LABEL: @cmp_eq6_align4(
-; X64_2LD-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X:%.*]], align 4
-; X64_2LD-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y:%.*]], align 4
-; X64_2LD-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
-; X64_2LD-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 4
-; X64_2LD-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 4
-; X64_2LD-NEXT:    [[TMP10:%.*]] = load i16, ptr [[TMP6]], align 4
-; X64_2LD-NEXT:    [[TMP11:%.*]] = load i16, ptr [[TMP7]], align 4
-; X64_2LD-NEXT:    [[TMP12:%.*]] = zext i16 [[TMP10]] to i32
-; X64_2LD-NEXT:    [[TMP13:%.*]] = zext i16 [[TMP11]] to i32
-; X64_2LD-NEXT:    [[TMP14:%.*]] = xor i32 [[TMP12]], [[TMP13]]
-; X64_2LD-NEXT:    [[TMP15:%.*]] = or i32 [[TMP5]], [[TMP14]]
-; X64_2LD-NEXT:    [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0
-; X64_2LD-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
-; X64_2LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP17]], 0
+; X64_2LD-LABEL: define i32 @cmp_eq6_align4(
+; X64_2LD-SAME: ptr nocapture readonly align 4 [[X:%.*]], ptr nocapture readonly align 4 [[Y:%.*]]) {
+; X64_2LD-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 4
+; X64_2LD-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 4
+; X64_2LD-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64_2LD-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; X64_2LD-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; X64_2LD-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP4]], align 4
+; X64_2LD-NEXT:    [[TMP7:%.*]] = load i16, ptr [[TMP5]], align 4
+; X64_2LD-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP6]] to i32
+; X64_2LD-NEXT:    [[TMP9:%.*]] = zext i16 [[TMP7]] to i32
+; X64_2LD-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; X64_2LD-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; X64_2LD-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; X64_2LD-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64_2LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
 ; X64_2LD-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X64_2LD-NEXT:    ret i32 [[CONV]]
 ;
@@ -687,41 +711,43 @@ define i32 @cmp_eq6_align4(ptr nocapture readonly align 4 %x, ptr nocapture read
 }
 
 define i32 @cmp_eq7(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X64_1LD-LABEL: @cmp_eq7(
+; X64_1LD-LABEL: define i32 @cmp_eq7(
+; X64_1LD-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
 ; X64_1LD-NEXT:    br label [[LOADBB:%.*]]
 ; X64_1LD:       res_block:
 ; X64_1LD-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X64_1LD:       loadbb:
-; X64_1LD-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X:%.*]], align 1
-; X64_1LD-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y:%.*]], align 1
-; X64_1LD-NEXT:    [[TMP5:%.*]] = icmp ne i32 [[TMP3]], [[TMP4]]
-; X64_1LD-NEXT:    br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
+; X64_1LD-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64_1LD-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64_1LD-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; X64_1LD-NEXT:    br i1 [[TMP3]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
 ; X64_1LD:       loadbb1:
-; X64_1LD-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 3
-; X64_1LD-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 3
-; X64_1LD-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP6]], align 1
-; X64_1LD-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP7]], align 1
-; X64_1LD-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP10]], [[TMP11]]
-; X64_1LD-NEXT:    br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]]
+; X64_1LD-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64_1LD-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64_1LD-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64_1LD-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64_1LD-NEXT:    [[TMP8:%.*]] = icmp ne i32 [[TMP6]], [[TMP7]]
+; X64_1LD-NEXT:    br i1 [[TMP8]], label [[RES_BLOCK]], label [[ENDBLOCK]]
 ; X64_1LD:       endblock:
 ; X64_1LD-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
 ; X64_1LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0
 ; X64_1LD-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X64_1LD-NEXT:    ret i32 [[CONV]]
 ;
-; X64_2LD-LABEL: @cmp_eq7(
-; X64_2LD-NEXT:    [[TMP3:%.*]] = load i32, ptr [[X:%.*]], align 1
-; X64_2LD-NEXT:    [[TMP4:%.*]] = load i32, ptr [[Y:%.*]], align 1
-; X64_2LD-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
-; X64_2LD-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 3
-; X64_2LD-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 3
-; X64_2LD-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP6]], align 1
-; X64_2LD-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP7]], align 1
-; X64_2LD-NEXT:    [[TMP12:%.*]] = xor i32 [[TMP10]], [[TMP11]]
-; X64_2LD-NEXT:    [[TMP13:%.*]] = or i32 [[TMP5]], [[TMP12]]
-; X64_2LD-NEXT:    [[TMP14:%.*]] = icmp ne i32 [[TMP13]], 0
-; X64_2LD-NEXT:    [[TMP15:%.*]] = zext i1 [[TMP14]] to i32
-; X64_2LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP15]], 0
+; X64_2LD-LABEL: define i32 @cmp_eq7(
+; X64_2LD-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X64_2LD-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; X64_2LD-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; X64_2LD-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; X64_2LD-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64_2LD-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64_2LD-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64_2LD-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64_2LD-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; X64_2LD-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; X64_2LD-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; X64_2LD-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64_2LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
 ; X64_2LD-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X64_2LD-NEXT:    ret i32 [[CONV]]
 ;
@@ -732,12 +758,13 @@ define i32 @cmp_eq7(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp_eq8(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X64-LABEL: @cmp_eq8(
-; X64-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X:%.*]], align 1
-; X64-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y:%.*]], align 1
-; X64-NEXT:    [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]]
-; X64-NEXT:    [[TMP6:%.*]] = zext i1 [[TMP5]] to i32
-; X64-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP6]], 0
+; X64-LABEL: define i32 @cmp_eq8(
+; X64-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X64-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP4]], 0
 ; X64-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X64-NEXT:    ret i32 [[CONV]]
 ;
@@ -748,43 +775,45 @@ define i32 @cmp_eq8(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp_eq9(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X64_1LD-LABEL: @cmp_eq9(
+; X64_1LD-LABEL: define i32 @cmp_eq9(
+; X64_1LD-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
 ; X64_1LD-NEXT:    br label [[LOADBB:%.*]]
 ; X64_1LD:       res_block:
 ; X64_1LD-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X64_1LD:       loadbb:
-; X64_1LD-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X:%.*]], align 1
-; X64_1LD-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y:%.*]], align 1
-; X64_1LD-NEXT:    [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]]
-; X64_1LD-NEXT:    br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
+; X64_1LD-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64_1LD-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64_1LD-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; X64_1LD-NEXT:    br i1 [[TMP3]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
 ; X64_1LD:       loadbb1:
-; X64_1LD-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 8
-; X64_1LD-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 8
-; X64_1LD-NEXT:    [[TMP8:%.*]] = load i8, ptr [[TMP6]], align 1
-; X64_1LD-NEXT:    [[TMP9:%.*]] = load i8, ptr [[TMP7]], align 1
-; X64_1LD-NEXT:    [[TMP10:%.*]] = icmp ne i8 [[TMP8]], [[TMP9]]
-; X64_1LD-NEXT:    br i1 [[TMP10]], label [[RES_BLOCK]], label [[ENDBLOCK]]
+; X64_1LD-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64_1LD-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64_1LD-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64_1LD-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64_1LD-NEXT:    [[TMP8:%.*]] = icmp ne i8 [[TMP6]], [[TMP7]]
+; X64_1LD-NEXT:    br i1 [[TMP8]], label [[RES_BLOCK]], label [[ENDBLOCK]]
 ; X64_1LD:       endblock:
 ; X64_1LD-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
 ; X64_1LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0
 ; X64_1LD-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X64_1LD-NEXT:    ret i32 [[CONV]]
 ;
-; X64_2LD-LABEL: @cmp_eq9(
-; X64_2LD-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X:%.*]], align 1
-; X64_2LD-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y:%.*]], align 1
-; X64_2LD-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP3]], [[TMP4]]
-; X64_2LD-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 8
-; X64_2LD-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 8
-; X64_2LD-NEXT:    [[TMP8:%.*]] = load i8, ptr [[TMP6]], align 1
-; X64_2LD-NEXT:    [[TMP9:%.*]] = load i8, ptr [[TMP7]], align 1
-; X64_2LD-NEXT:    [[TMP10:%.*]] = zext i8 [[TMP8]] to i64
-; X64_2LD-NEXT:    [[TMP11:%.*]] = zext i8 [[TMP9]] to i64
-; X64_2LD-NEXT:    [[TMP12:%.*]] = xor i64 [[TMP10]], [[TMP11]]
-; X64_2LD-NEXT:    [[TMP13:%.*]] = or i64 [[TMP5]], [[TMP12]]
-; X64_2LD-NEXT:    [[TMP14:%.*]] = icmp ne i64 [[TMP13]], 0
-; X64_2LD-NEXT:    [[TMP15:%.*]] = zext i1 [[TMP14]] to i32
-; X64_2LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP15]], 0
+; X64_2LD-LABEL: define i32 @cmp_eq9(
+; X64_2LD-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X64_2LD-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64_2LD-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64_2LD-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64_2LD-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64_2LD-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64_2LD-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; X64_2LD-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; X64_2LD-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i64
+; X64_2LD-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i64
+; X64_2LD-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64_2LD-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64_2LD-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64_2LD-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64_2LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
 ; X64_2LD-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X64_2LD-NEXT:    ret i32 [[CONV]]
 ;
@@ -795,43 +824,45 @@ define i32 @cmp_eq9(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp_eq10(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X64_1LD-LABEL: @cmp_eq10(
+; X64_1LD-LABEL: define i32 @cmp_eq10(
+; X64_1LD-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
 ; X64_1LD-NEXT:    br label [[LOADBB:%.*]]
 ; X64_1LD:       res_block:
 ; X64_1LD-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X64_1LD:       loadbb:
-; X64_1LD-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X:%.*]], align 1
-; X64_1LD-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y:%.*]], align 1
-; X64_1LD-NEXT:    [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]]
-; X64_1LD-NEXT:    br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
+; X64_1LD-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64_1LD-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64_1LD-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; X64_1LD-NEXT:    br i1 [[TMP3]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
 ; X64_1LD:       loadbb1:
-; X64_1LD-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 8
-; X64_1LD-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 8
-; X64_1LD-NEXT:    [[TMP10:%.*]] = load i16, ptr [[TMP6]], align 1
-; X64_1LD-NEXT:    [[TMP11:%.*]] = load i16, ptr [[TMP7]], align 1
-; X64_1LD-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP10]], [[TMP11]]
-; X64_1LD-NEXT:    br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]]
+; X64_1LD-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64_1LD-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64_1LD-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP4]], align 1
+; X64_1LD-NEXT:    [[TMP7:%.*]] = load i16, ptr [[TMP5]], align 1
+; X64_1LD-NEXT:    [[TMP8:%.*]] = icmp ne i16 [[TMP6]], [[TMP7]]
+; X64_1LD-NEXT:    br i1 [[TMP8]], label [[RES_BLOCK]], label [[ENDBLOCK]]
 ; X64_1LD:       endblock:
 ; X64_1LD-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
 ; X64_1LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0
 ; X64_1LD-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X64_1LD-NEXT:    ret i32 [[CONV]]
 ;
-; X64_2LD-LABEL: @cmp_eq10(
-; X64_2LD-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X:%.*]], align 1
-; X64_2LD-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y:%.*]], align 1
-; X64_2LD-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP3]], [[TMP4]]
-; X64_2LD-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 8
-; X64_2LD-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 8
-; X64_2LD-NEXT:    [[TMP10:%.*]] = load i16, ptr [[TMP6]], align 1
-; X64_2LD-NEXT:    [[TMP11:%.*]] = load i16, ptr [[TMP7]], align 1
-; X64_2LD-NEXT:    [[TMP12:%.*]] = zext i16 [[TMP10]] to i64
-; X64_2LD-NEXT:    [[TMP13:%.*]] = zext i16 [[TMP11]] to i64
-; X64_2LD-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP12]], [[TMP13]]
-; X64_2LD-NEXT:    [[TMP15:%.*]] = or i64 [[TMP5]], [[TMP14]]
-; X64_2LD-NEXT:    [[TMP16:%.*]] = icmp ne i64 [[TMP15]], 0
-; X64_2LD-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
-; X64_2LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP17]], 0
+; X64_2LD-LABEL: define i32 @cmp_eq10(
+; X64_2LD-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X64_2LD-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64_2LD-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64_2LD-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64_2LD-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64_2LD-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64_2LD-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP4]], align 1
+; X64_2LD-NEXT:    [[TMP7:%.*]] = load i16, ptr [[TMP5]], align 1
+; X64_2LD-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP6]] to i64
+; X64_2LD-NEXT:    [[TMP9:%.*]] = zext i16 [[TMP7]] to i64
+; X64_2LD-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64_2LD-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64_2LD-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64_2LD-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64_2LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
 ; X64_2LD-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X64_2LD-NEXT:    ret i32 [[CONV]]
 ;
@@ -842,41 +873,43 @@ define i32 @cmp_eq10(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp_eq11(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X64_1LD-LABEL: @cmp_eq11(
+; X64_1LD-LABEL: define i32 @cmp_eq11(
+; X64_1LD-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
 ; X64_1LD-NEXT:    br label [[LOADBB:%.*]]
 ; X64_1LD:       res_block:
 ; X64_1LD-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X64_1LD:       loadbb:
-; X64_1LD-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X:%.*]], align 1
-; X64_1LD-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y:%.*]], align 1
-; X64_1LD-NEXT:    [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]]
-; X64_1LD-NEXT:    br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
+; X64_1LD-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64_1LD-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64_1LD-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; X64_1LD-NEXT:    br i1 [[TMP3]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
 ; X64_1LD:       loadbb1:
-; X64_1LD-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 3
-; X64_1LD-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 3
-; X64_1LD-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP6]], align 1
-; X64_1LD-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP7]], align 1
-; X64_1LD-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP10]], [[TMP11]]
-; X64_1LD-NEXT:    br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]]
+; X64_1LD-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64_1LD-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64_1LD-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64_1LD-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64_1LD-NEXT:    [[TMP8:%.*]] = icmp ne i64 [[TMP6]], [[TMP7]]
+; X64_1LD-NEXT:    br i1 [[TMP8]], label [[RES_BLOCK]], label [[ENDBLOCK]]
 ; X64_1LD:       endblock:
 ; X64_1LD-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
 ; X64_1LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0
 ; X64_1LD-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X64_1LD-NEXT:    ret i32 [[CONV]]
 ;
-; X64_2LD-LABEL: @cmp_eq11(
-; X64_2LD-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X:%.*]], align 1
-; X64_2LD-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y:%.*]], align 1
-; X64_2LD-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP3]], [[TMP4]]
-; X64_2LD-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 3
-; X64_2LD-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 3
-; X64_2LD-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP6]], align 1
-; X64_2LD-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP7]], align 1
-; X64_2LD-NEXT:    [[TMP12:%.*]] = xor i64 [[TMP10]], [[TMP11]]
-; X64_2LD-NEXT:    [[TMP13:%.*]] = or i64 [[TMP5]], [[TMP12]]
-; X64_2LD-NEXT:    [[TMP14:%.*]] = icmp ne i64 [[TMP13]], 0
-; X64_2LD-NEXT:    [[TMP15:%.*]] = zext i1 [[TMP14]] to i32
-; X64_2LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP15]], 0
+; X64_2LD-LABEL: define i32 @cmp_eq11(
+; X64_2LD-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X64_2LD-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64_2LD-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64_2LD-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64_2LD-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; X64_2LD-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; X64_2LD-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64_2LD-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64_2LD-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64_2LD-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64_2LD-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64_2LD-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64_2LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
 ; X64_2LD-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X64_2LD-NEXT:    ret i32 [[CONV]]
 ;
@@ -887,43 +920,45 @@ define i32 @cmp_eq11(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp_eq12(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X64_1LD-LABEL: @cmp_eq12(
+; X64_1LD-LABEL: define i32 @cmp_eq12(
+; X64_1LD-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
 ; X64_1LD-NEXT:    br label [[LOADBB:%.*]]
 ; X64_1LD:       res_block:
 ; X64_1LD-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X64_1LD:       loadbb:
-; X64_1LD-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X:%.*]], align 1
-; X64_1LD-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y:%.*]], align 1
-; X64_1LD-NEXT:    [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]]
-; X64_1LD-NEXT:    br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
+; X64_1LD-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64_1LD-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64_1LD-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; X64_1LD-NEXT:    br i1 [[TMP3]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
 ; X64_1LD:       loadbb1:
-; X64_1LD-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 8
-; X64_1LD-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 8
-; X64_1LD-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP6]], align 1
-; X64_1LD-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP7]], align 1
-; X64_1LD-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP10]], [[TMP11]]
-; X64_1LD-NEXT:    br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]]
+; X64_1LD-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64_1LD-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64_1LD-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64_1LD-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64_1LD-NEXT:    [[TMP8:%.*]] = icmp ne i32 [[TMP6]], [[TMP7]]
+; X64_1LD-NEXT:    br i1 [[TMP8]], label [[RES_BLOCK]], label [[ENDBLOCK]]
 ; X64_1LD:       endblock:
 ; X64_1LD-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
 ; X64_1LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0
 ; X64_1LD-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X64_1LD-NEXT:    ret i32 [[CONV]]
 ;
-; X64_2LD-LABEL: @cmp_eq12(
-; X64_2LD-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X:%.*]], align 1
-; X64_2LD-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y:%.*]], align 1
-; X64_2LD-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP3]], [[TMP4]]
-; X64_2LD-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 8
-; X64_2LD-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 8
-; X64_2LD-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP6]], align 1
-; X64_2LD-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP7]], align 1
-; X64_2LD-NEXT:    [[TMP12:%.*]] = zext i32 [[TMP10]] to i64
-; X64_2LD-NEXT:    [[TMP13:%.*]] = zext i32 [[TMP11]] to i64
-; X64_2LD-NEXT:    [[TMP14:%.*]] = xor i64 [[TMP12]], [[TMP13]]
-; X64_2LD-NEXT:    [[TMP15:%.*]] = or i64 [[TMP5]], [[TMP14]]
-; X64_2LD-NEXT:    [[TMP16:%.*]] = icmp ne i64 [[TMP15]], 0
-; X64_2LD-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
-; X64_2LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP17]], 0
+; X64_2LD-LABEL: define i32 @cmp_eq12(
+; X64_2LD-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X64_2LD-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64_2LD-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64_2LD-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64_2LD-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; X64_2LD-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; X64_2LD-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X64_2LD-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; X64_2LD-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
+; X64_2LD-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP7]] to i64
+; X64_2LD-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; X64_2LD-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; X64_2LD-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; X64_2LD-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; X64_2LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
 ; X64_2LD-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X64_2LD-NEXT:    ret i32 [[CONV]]
 ;
@@ -934,41 +969,43 @@ define i32 @cmp_eq12(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp_eq13(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X64_1LD-LABEL: @cmp_eq13(
+; X64_1LD-LABEL: define i32 @cmp_eq13(
+; X64_1LD-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
 ; X64_1LD-NEXT:    br label [[LOADBB:%.*]]
 ; X64_1LD:       res_block:
 ; X64_1LD-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X64_1LD:       loadbb:
-; X64_1LD-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X:%.*]], align 1
-; X64_1LD-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y:%.*]], align 1
-; X64_1LD-NEXT:    [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]]
-; X64_1LD-NEXT:    br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
+; X64_1LD-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64_1LD-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64_1LD-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; X64_1LD-NEXT:    br i1 [[TMP3]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
 ; X64_1LD:       loadbb1:
-; X64_1LD-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 5
-; X64_1LD-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 5
-; X64_1LD-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP6]], align 1
-; X64_1LD-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP7]], align 1
-; X64_1LD-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP10]], [[TMP11]]
-; X64_1LD-NEXT:    br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]]
+; X64_1LD-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 5
+; X64_1LD-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 5
+; X64_1LD-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64_1LD-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64_1LD-NEXT:    [[TMP8:%.*]] = icmp ne i64 [[TMP6]], [[TMP7]]
+; X64_1LD-NEXT:    br i1 [[TMP8]], label [[RES_BLOCK]], label [[ENDBLOCK]]
 ; X64_1LD:       endblock:
 ; X64_1LD-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
 ; X64_1LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0
 ; X64_1LD-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X64_1LD-NEXT:    ret i32 [[CONV]]
 ;
-; X64_2LD-LABEL: @cmp_eq13(
-; X64_2LD-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X:%.*]], align 1
-; X64_2LD-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y:%.*]], align 1
-; X64_2LD-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP3]], [[TMP4]]
-; X64_2LD-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 5
-; X64_2LD-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 5
-; X64_2LD-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP6]], align 1
-; X64_2LD-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP7]], align 1
-; X64_2LD-NEXT:    [[TMP12:%.*]] = xor i64 [[TMP10]], [[TMP11]]
-; X64_2LD-NEXT:    [[TMP13:%.*]] = or i64 [[TMP5]], [[TMP12]]
-; X64_2LD-NEXT:    [[TMP14:%.*]] = icmp ne i64 [[TMP13]], 0
-; X64_2LD-NEXT:    [[TMP15:%.*]] = zext i1 [[TMP14]] to i32
-; X64_2LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP15]], 0
+; X64_2LD-LABEL: define i32 @cmp_eq13(
+; X64_2LD-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X64_2LD-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64_2LD-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64_2LD-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64_2LD-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 5
+; X64_2LD-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 5
+; X64_2LD-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64_2LD-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64_2LD-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64_2LD-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64_2LD-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64_2LD-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64_2LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
 ; X64_2LD-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X64_2LD-NEXT:    ret i32 [[CONV]]
 ;
@@ -979,41 +1016,43 @@ define i32 @cmp_eq13(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp_eq14(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X64_1LD-LABEL: @cmp_eq14(
+; X64_1LD-LABEL: define i32 @cmp_eq14(
+; X64_1LD-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
 ; X64_1LD-NEXT:    br label [[LOADBB:%.*]]
 ; X64_1LD:       res_block:
 ; X64_1LD-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X64_1LD:       loadbb:
-; X64_1LD-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X:%.*]], align 1
-; X64_1LD-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y:%.*]], align 1
-; X64_1LD-NEXT:    [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]]
-; X64_1LD-NEXT:    br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
+; X64_1LD-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64_1LD-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64_1LD-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; X64_1LD-NEXT:    br i1 [[TMP3]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
 ; X64_1LD:       loadbb1:
-; X64_1LD-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 6
-; X64_1LD-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 6
-; X64_1LD-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP6]], align 1
-; X64_1LD-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP7]], align 1
-; X64_1LD-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP10]], [[TMP11]]
-; X64_1LD-NEXT:    br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]]
+; X64_1LD-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 6
+; X64_1LD-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 6
+; X64_1LD-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64_1LD-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64_1LD-NEXT:    [[TMP8:%.*]] = icmp ne i64 [[TMP6]], [[TMP7]]
+; X64_1LD-NEXT:    br i1 [[TMP8]], label [[RES_BLOCK]], label [[ENDBLOCK]]
 ; X64_1LD:       endblock:
 ; X64_1LD-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
 ; X64_1LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0
 ; X64_1LD-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X64_1LD-NEXT:    ret i32 [[CONV]]
 ;
-; X64_2LD-LABEL: @cmp_eq14(
-; X64_2LD-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X:%.*]], align 1
-; X64_2LD-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y:%.*]], align 1
-; X64_2LD-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP3]], [[TMP4]]
-; X64_2LD-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 6
-; X64_2LD-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 6
-; X64_2LD-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP6]], align 1
-; X64_2LD-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP7]], align 1
-; X64_2LD-NEXT:    [[TMP12:%.*]] = xor i64 [[TMP10]], [[TMP11]]
-; X64_2LD-NEXT:    [[TMP13:%.*]] = or i64 [[TMP5]], [[TMP12]]
-; X64_2LD-NEXT:    [[TMP14:%.*]] = icmp ne i64 [[TMP13]], 0
-; X64_2LD-NEXT:    [[TMP15:%.*]] = zext i1 [[TMP14]] to i32
-; X64_2LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP15]], 0
+; X64_2LD-LABEL: define i32 @cmp_eq14(
+; X64_2LD-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X64_2LD-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64_2LD-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64_2LD-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64_2LD-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 6
+; X64_2LD-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 6
+; X64_2LD-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64_2LD-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64_2LD-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64_2LD-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64_2LD-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64_2LD-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64_2LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
 ; X64_2LD-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X64_2LD-NEXT:    ret i32 [[CONV]]
 ;
@@ -1024,41 +1063,43 @@ define i32 @cmp_eq14(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp_eq15(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X64_1LD-LABEL: @cmp_eq15(
+; X64_1LD-LABEL: define i32 @cmp_eq15(
+; X64_1LD-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
 ; X64_1LD-NEXT:    br label [[LOADBB:%.*]]
 ; X64_1LD:       res_block:
 ; X64_1LD-NEXT:    br label [[ENDBLOCK:%.*]]
 ; X64_1LD:       loadbb:
-; X64_1LD-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X:%.*]], align 1
-; X64_1LD-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y:%.*]], align 1
-; X64_1LD-NEXT:    [[TMP5:%.*]] = icmp ne i64 [[TMP3]], [[TMP4]]
-; X64_1LD-NEXT:    br i1 [[TMP5]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
+; X64_1LD-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64_1LD-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64_1LD-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; X64_1LD-NEXT:    br i1 [[TMP3]], label [[RES_BLOCK:%.*]], label [[LOADBB1:%.*]]
 ; X64_1LD:       loadbb1:
-; X64_1LD-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 7
-; X64_1LD-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 7
-; X64_1LD-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP6]], align 1
-; X64_1LD-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP7]], align 1
-; X64_1LD-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP10]], [[TMP11]]
-; X64_1LD-NEXT:    br i1 [[TMP12]], label [[RES_BLOCK]], label [[ENDBLOCK]]
+; X64_1LD-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64_1LD-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64_1LD-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64_1LD-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64_1LD-NEXT:    [[TMP8:%.*]] = icmp ne i64 [[TMP6]], [[TMP7]]
+; X64_1LD-NEXT:    br i1 [[TMP8]], label [[RES_BLOCK]], label [[ENDBLOCK]]
 ; X64_1LD:       endblock:
 ; X64_1LD-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ 1, [[RES_BLOCK]] ]
 ; X64_1LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[PHI_RES]], 0
 ; X64_1LD-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X64_1LD-NEXT:    ret i32 [[CONV]]
 ;
-; X64_2LD-LABEL: @cmp_eq15(
-; X64_2LD-NEXT:    [[TMP3:%.*]] = load i64, ptr [[X:%.*]], align 1
-; X64_2LD-NEXT:    [[TMP4:%.*]] = load i64, ptr [[Y:%.*]], align 1
-; X64_2LD-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP3]], [[TMP4]]
-; X64_2LD-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 7
-; X64_2LD-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 7
-; X64_2LD-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP6]], align 1
-; X64_2LD-NEXT:    [[TMP11:%.*]] = load i64, ptr [[TMP7]], align 1
-; X64_2LD-NEXT:    [[TMP12:%.*]] = xor i64 [[TMP10]], [[TMP11]]
-; X64_2LD-NEXT:    [[TMP13:%.*]] = or i64 [[TMP5]], [[TMP12]]
-; X64_2LD-NEXT:    [[TMP14:%.*]] = icmp ne i64 [[TMP13]], 0
-; X64_2LD-NEXT:    [[TMP15:%.*]] = zext i1 [[TMP14]] to i32
-; X64_2LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP15]], 0
+; X64_2LD-LABEL: define i32 @cmp_eq15(
+; X64_2LD-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X64_2LD-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; X64_2LD-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; X64_2LD-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; X64_2LD-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; X64_2LD-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; X64_2LD-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; X64_2LD-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; X64_2LD-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; X64_2LD-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; X64_2LD-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; X64_2LD-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; X64_2LD-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
 ; X64_2LD-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X64_2LD-NEXT:    ret i32 [[CONV]]
 ;
@@ -1069,12 +1110,13 @@ define i32 @cmp_eq15(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
 }
 
 define i32 @cmp_eq16(ptr nocapture readonly %x, ptr nocapture readonly %y)  {
-; X64-LABEL: @cmp_eq16(
-; X64-NEXT:    [[TMP3:%.*]] = load i128, ptr [[X:%.*]], align 1
-; X64-NEXT:    [[TMP4:%.*]] = load i128, ptr [[Y:%.*]], align 1
-; X64-NEXT:    [[TMP5:%.*]] = icmp ne i128 [[TMP3]], [[TMP4]]
-; X64-NEXT:    [[TMP6:%.*]] = zext i1 [[TMP5]] to i32
-; X64-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP6]], 0
+; X64-LABEL: define i32 @cmp_eq16(
+; X64-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) {
+; X64-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; X64-NEXT:    [[TMP3:%.*]] = icmp ne i128 [[TMP1]], [[TMP2]]
+; X64-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; X64-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP4]], 0
 ; X64-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
 ; X64-NEXT:    ret i32 [[CONV]]
 ;
diff --git a/llvm/test/Transforms/PhaseOrdering/PowerPC/lit.local.cfg b/llvm/test/Transforms/PhaseOrdering/PowerPC/lit.local.cfg
new file mode 100644
index 00000000000000..dfb347e640e144
--- /dev/null
+++ b/llvm/test/Transforms/PhaseOrdering/PowerPC/lit.local.cfg
@@ -0,0 +1,2 @@
+if not 'PowerPC' in config.root.targets:
+    config.unsupported = True
\ No newline at end of file
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/memcmp-early.ll b/llvm/test/Transforms/PhaseOrdering/X86/memcmp-early.ll
new file mode 100644
index 00000000000000..a62b17de08ee43
--- /dev/null
+++ b/llvm/test/Transforms/PhaseOrdering/X86/memcmp-early.ll
@@ -0,0 +1,86 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt -O2 -S -mtriple=x86_64-unknown-unknown < %s | FileCheck %s
+
+@s1 = internal global ptr @.str, align 8
+@s2 = internal global ptr @.str.1, align 8
+@s3 = internal global ptr @.str.2, align 8
+@.str = private unnamed_addr constant [9 x i8] c"01234000\00", align 1
+@.str.1 = private unnamed_addr constant [9 x i8] c"0123!000\00", align 1
+@.str.2 = private unnamed_addr constant [9 x i8] c"0123?000\00", align 1
+
+; Function Attrs: noinline nounwind optnone uwtable
+define dso_local i32 @memcmp_same_prefix_consts(ptr noundef %x) #0 {
+; CHECK-LABEL: define dso_local noundef i32 @memcmp_same_prefix_consts(
+; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = xor i32 [[TMP0]], 858927408
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[TMP2]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i8 [[TMP3]] to i32
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP4]], 52
+; CHECK-NEXT:    [[TMP6:%.*]] = or i32 [[TMP5]], [[TMP1]]
+; CHECK-NEXT:    [[DOTNOT:%.*]] = icmp eq i32 [[TMP6]], 0
+; CHECK-NEXT:    br i1 [[DOTNOT]], label [[IF_END8:%.*]], label [[IF_THEN:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[TMP7:%.*]] = xor i32 [[TMP4]], 33
+; CHECK-NEXT:    [[TMP8:%.*]] = or i32 [[TMP7]], [[TMP1]]
+; CHECK-NEXT:    [[DOTNOT3:%.*]] = icmp eq i32 [[TMP8]], 0
+; CHECK-NEXT:    br i1 [[DOTNOT3]], label [[IF_END8]], label [[IF_THEN3:%.*]]
+; CHECK:       if.then3:
+; CHECK-NEXT:    [[TMP9:%.*]] = xor i32 [[TMP4]], 63
+; CHECK-NEXT:    [[TMP10:%.*]] = or i32 [[TMP9]], [[TMP1]]
+; CHECK-NEXT:    [[DOTNOT4:%.*]] = icmp eq i32 [[TMP10]], 0
+; CHECK-NEXT:    br i1 [[DOTNOT4]], label [[IF_END8]], label [[RETURN:%.*]]
+; CHECK:       if.end8:
+; CHECK-NEXT:    br label [[RETURN]]
+; CHECK:       return:
+; CHECK-NEXT:    [[RETVAL_0:%.*]] = phi i32 [ 0, [[IF_END8]] ], [ 42, [[IF_THEN3]] ]
+; CHECK-NEXT:    ret i32 [[RETVAL_0]]
+;
+entry:
+  %retval = alloca i32, align 4
+  %x.addr = alloca ptr, align 8
+  store ptr %x, ptr %x.addr, align 8
+  %0 = load ptr, ptr %x.addr, align 8
+  %1 = load ptr, ptr @s1, align 8
+  %call = call i32 @memcmp(ptr noundef %0, ptr noundef %1, i64 noundef 5) #2
+  %cmp = icmp ne i32 %call, 0
+  br i1 %cmp, label %if.then, label %if.end8
+
+if.then:                                          ; preds = %entry
+  %2 = load ptr, ptr %x.addr, align 8
+  %3 = load ptr, ptr @s2, align 8
+  %call1 = call i32 @memcmp(ptr noundef %2, ptr noundef %3, i64 noundef 5) #2
+  %cmp2 = icmp ne i32 %call1, 0
+  br i1 %cmp2, label %if.then3, label %if.end7
+
+if.then3:                                         ; preds = %if.then
+  %4 = load ptr, ptr %x.addr, align 8
+  %5 = load ptr, ptr @s3, align 8
+  %call4 = call i32 @memcmp(ptr noundef %4, ptr noundef %5, i64 noundef 5) #2
+  %cmp5 = icmp ne i32 %call4, 0
+  br i1 %cmp5, label %if.then6, label %if.end
+
+if.then6:                                         ; preds = %if.then3
+  store i32 42, ptr %retval, align 4
+  br label %return
+
+if.end:                                           ; preds = %if.then3
+  br label %if.end7
+
+if.end7:                                          ; preds = %if.end, %if.then
+  br label %if.end8
+
+if.end8:                                          ; preds = %if.end7, %entry
+  store i32 0, ptr %retval, align 4
+  br label %return
+
+return:                                           ; preds = %if.end8, %if.then6
+  %6 = load i32, ptr %retval, align 4
+  ret i32 %6
+}
+
+; Function Attrs: nounwind willreturn memory(read)
+declare i32 @memcmp(ptr noundef, ptr noundef, i64 noundef) #1
+
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/memcmp-mergeexpand.ll b/llvm/test/Transforms/PhaseOrdering/X86/memcmp-mergeexpand.ll
new file mode 100644
index 00000000000000..2de1f8576f631f
--- /dev/null
+++ b/llvm/test/Transforms/PhaseOrdering/X86/memcmp-mergeexpand.ll
@@ -0,0 +1,62 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt -S --passes=mergeicmps,expand-memcmp -mtriple=i686-unknown-linux < %s | FileCheck %s --check-prefix=X86
+; RUN: opt -S --passes=mergeicmps,expand-memcmp -mtriple=x86_64-unknown-linux < %s | FileCheck %s --check-prefix=X64
+
+; This tests interaction between MergeICmp and ExpandMemCmp.
+
+%"struct.std::pair" = type { i32, i32 }
+
+define zeroext i1 @opeq1(
+; X86-LABEL: define zeroext i1 @opeq1(
+; X86-SAME: ptr nocapture readonly dereferenceable(8) [[A:%.*]], ptr nocapture readonly dereferenceable(8) [[B:%.*]]) local_unnamed_addr {
+; X86-NEXT:  "entry+land.rhs.i":
+; X86-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A]], align 1
+; X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[B]], align 1
+; X86-NEXT:    [[TMP2:%.*]] = xor i32 [[TMP0]], [[TMP1]]
+; X86-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[A]], i64 4
+; X86-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 4
+; X86-NEXT:    [[TMP5:%.*]] = load i32, ptr [[TMP3]], align 1
+; X86-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; X86-NEXT:    [[TMP7:%.*]] = xor i32 [[TMP5]], [[TMP6]]
+; X86-NEXT:    [[TMP8:%.*]] = or i32 [[TMP2]], [[TMP7]]
+; X86-NEXT:    [[TMP9:%.*]] = icmp ne i32 [[TMP8]], 0
+; X86-NEXT:    [[TMP10:%.*]] = zext i1 [[TMP9]] to i32
+; X86-NEXT:    [[TMP11:%.*]] = icmp eq i32 [[TMP10]], 0
+; X86-NEXT:    br label [[OPEQ1_EXIT:%.*]]
+; X86:       opeq1.exit:
+; X86-NEXT:    ret i1 [[TMP11]]
+;
+; X64-LABEL: define zeroext i1 @opeq1(
+; X64-SAME: ptr nocapture readonly dereferenceable(8) [[A:%.*]], ptr nocapture readonly dereferenceable(8) [[B:%.*]]) local_unnamed_addr {
+; X64-NEXT:  "entry+land.rhs.i":
+; X64-NEXT:    [[TMP0:%.*]] = load i64, ptr [[A]], align 1
+; X64-NEXT:    [[TMP1:%.*]] = load i64, ptr [[B]], align 1
+; X64-NEXT:    [[TMP2:%.*]] = icmp ne i64 [[TMP0]], [[TMP1]]
+; X64-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; X64-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[TMP3]], 0
+; X64-NEXT:    br label [[OPEQ1_EXIT:%.*]]
+; X64:       opeq1.exit:
+; X64-NEXT:    ret i1 [[TMP4]]
+;
+  %"struct.std::pair"* nocapture readonly dereferenceable(8) %a,
+  %"struct.std::pair"* nocapture readonly dereferenceable(8) %b) local_unnamed_addr #0 {
+entry:
+  %first.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 0
+  %0 = load i32, i32* %first.i, align 4
+  %first1.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 0
+  %1 = load i32, i32* %first1.i, align 4
+  %cmp.i = icmp eq i32 %0, %1
+  br i1 %cmp.i, label %land.rhs.i, label %opeq1.exit
+
+land.rhs.i:
+  %second.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 1
+  %2 = load i32, i32* %second.i, align 4
+  %second2.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 1
+  %3 = load i32, i32* %second2.i, align 4
+  %cmp3.i = icmp eq i32 %2, %3
+  br label %opeq1.exit
+
+opeq1.exit:
+  %4 = phi i1 [ false, %entry ], [ %cmp3.i, %land.rhs.i ]
+  ret i1 %4
+}
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/memcmp.ll b/llvm/test/Transforms/PhaseOrdering/X86/memcmp.ll
new file mode 100644
index 00000000000000..68dfacac5b5e12
--- /dev/null
+++ b/llvm/test/Transforms/PhaseOrdering/X86/memcmp.ll
@@ -0,0 +1,856 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt  -O2 -S -mtriple=x86_64-unknown-unknown < %s | FileCheck %s
+
+@.str = private constant [65 x i8] c"0123456789012345678901234567890123456789012345678901234567890123\00", align 1
+
+declare i32 @memcmp(ptr, ptr, i64)
+
+declare i32 @bcmp(ptr, ptr, i64)
+
+; Function Attrs: nounwind. Zero-length memcmp with the i32 result returned directly; the checks expect a constant `ret i32 0`.
+define i32 @length0(ptr %X, ptr %Y) #0 {
+; CHECK-LABEL: define noundef i32 @length0(
+; CHECK-SAME: ptr nocapture readnone [[X:%.*]], ptr nocapture readnone [[Y:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT:    ret i32 0
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 0) #0
+  ret i32 %m
+}
+
+; Function Attrs: nounwind. Zero-length memcmp compared `eq 0`; the checks expect a constant `ret i1 true`.
+define i1 @length0_eq(ptr %X, ptr %Y) #0 {
+; CHECK-LABEL: define noundef i1 @length0_eq(
+; CHECK-SAME: ptr nocapture readnone [[X:%.*]], ptr nocapture readnone [[Y:%.*]]) local_unnamed_addr #[[ATTR1]] {
+; CHECK-NEXT:    ret i1 true
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 0) #0
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; Function Attrs: nounwind. Zero-length memcmp compared `slt 0`; the checks expect a constant `ret i1 false`.
+define i1 @length0_lt(ptr %X, ptr %Y) #0 {
+; CHECK-LABEL: define noundef i1 @length0_lt(
+; CHECK-SAME: ptr nocapture readnone [[X:%.*]], ptr nocapture readnone [[Y:%.*]]) local_unnamed_addr #[[ATTR1]] {
+; CHECK-NEXT:    ret i1 false
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 0) #0
+  %c = icmp slt i32 %m, 0
+  ret i1 %c
+}
+
+; Function Attrs: nounwind. 2-byte memcmp, i32 result returned directly; the checks expect two i16 loads, byte-swapped, zero-extended and subtracted.
+define i32 @length2(ptr %X, ptr %Y) #0 {
+; CHECK-LABEL: define i32 @length2(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; CHECK-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; CHECK-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; CHECK-NEXT:    [[TMP7:%.*]] = sub nsw i32 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    ret i32 [[TMP7]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) #0
+  ret i32 %m
+}
+
+; Function Attrs: nounwind. 2-byte memcmp compared `eq 0`; the checks expect a single i16 equality compare of the two loads (no bswap).
+define i1 @length2_eq(ptr %X, ptr %Y) #0 {
+; CHECK-LABEL: define i1 @length2_eq(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; CHECK-NEXT:    [[DOTNOT:%.*]] = icmp eq i16 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret i1 [[DOTNOT]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) #0
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; Function Attrs: nounwind. 2-byte memcmp compared `slt 0`; the checks expect an unsigned `ult` compare of the byte-swapped i16 loads.
+define i1 @length2_lt(ptr %X, ptr %Y) #0 {
+; CHECK-LABEL: define i1 @length2_lt(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; CHECK-NEXT:    [[C:%.*]] = icmp ult i16 [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) #0
+  %c = icmp slt i32 %m, 0
+  ret i1 %c
+}
+
+; Function Attrs: nounwind. 2-byte memcmp compared `sgt 0`; the checks expect an unsigned `ugt` compare of the byte-swapped i16 loads.
+define i1 @length2_gt(ptr %X, ptr %Y) #0 {
+; CHECK-LABEL: define i1 @length2_gt(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call i16 @llvm.bswap.i16(i16 [[TMP2]])
+; CHECK-NEXT:    [[C:%.*]] = icmp ugt i16 [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) #0
+  %c = icmp sgt i32 %m, 0
+  ret i1 %c
+}
+
+; Function Attrs: nounwind. 2-byte memcmp against @.str+1, compared `ne 0` (despite the `_eq` name); the checks expect one i16 load compared `ne` with the constant 12849.
+define i1 @length2_eq_const(ptr %X) #0 {
+; CHECK-LABEL: define i1 @length2_eq_const(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ne i16 [[TMP1]], 12849
+; CHECK-NEXT:    ret i1 [[TMP2]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([65 x i8], ptr @.str, i32 0, i32 1), i64 2) #0
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+; Function Attrs: nounwind. 2-byte memcmp whose call site uses attribute group #1 rather than #0 (group defined outside this view; presumably nobuiltin, per the test name - confirm); the checks expect the call to be left in place.
+define i1 @length2_eq_nobuiltin_attr(ptr %X, ptr %Y) #0 {
+; CHECK-LABEL: define i1 @length2_eq_nobuiltin_attr(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 2) #[[ATTR4:[0-9]+]]
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) #1
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; Function Attrs: nounwind. 3-byte memcmp, i32 result returned directly; the checks expect an i16 compare block, an i8 tail block at offset 2, and a result block selecting -1 or 1.
+define i32 @length3(ptr %X, ptr %Y) #0 {
+; CHECK-LABEL: define i32 @length3(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-NEXT:  loadbb:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i16 [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    br i1 [[TMP2]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call i16 @llvm.bswap.i16(i16 [[TMP0]])
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ult i16 [[TMP4]], [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], i32 -1, i32 1
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; CHECK-NEXT:    [[TMP9:%.*]] = load i8, ptr [[TMP7]], align 1
+; CHECK-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = zext i8 [[TMP9]] to i32
+; CHECK-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; CHECK-NEXT:    [[TMP13:%.*]] = sub nsw i32 [[TMP11]], [[TMP12]]
+; CHECK-NEXT:    br label [[ENDBLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP13]], [[LOADBB1]] ], [ [[TMP6]], [[RES_BLOCK]] ]
+; CHECK-NEXT:    ret i32 [[PHI_RES]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 3) #0
+  ret i32 %m
+}
+
+; Function Attrs: nounwind. 3-byte memcmp compared `ne 0` (despite the `_eq` name); the checks expect a branch-free xor/or of an i16 load and an i8 load at offset 2, tested `ne 0`.
+define i1 @length3_eq(ptr %X, ptr %Y) #0 {
+; CHECK-LABEL: define i1 @length3_eq(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP2]], [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
+; CHECK-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i8 [[TMP7]], [[TMP6]]
+; CHECK-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP8]] to i16
+; CHECK-NEXT:    [[TMP10:%.*]] = or i16 [[TMP3]], [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp ne i16 [[TMP10]], 0
+; CHECK-NEXT:    ret i1 [[TMP11]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 3) #0
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+; Function Attrs: nounwind. 4-byte memcmp, i32 result returned directly; the checks expect byte-swapped i32 loads whose `ugt`/`ult` flags are extended and added to form the result.
+define i32 @length4(ptr %X, ptr %Y) #0 {
+; CHECK-LABEL: define i32 @length4(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; CHECK-NEXT:    [[DOTNEG:%.*]] = sext i1 [[TMP6]] to i32
+; CHECK-NEXT:    [[TMP8:%.*]] = add nsw i32 [[DOTNEG]], [[TMP7]]
+; CHECK-NEXT:    ret i32 [[TMP8]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) #0
+  ret i32 %m
+}
+
+; Function Attrs: nounwind. 4-byte memcmp compared `ne 0` (despite the `_eq` name); the checks expect a single i32 `ne` compare of the two loads.
+define i1 @length4_eq(ptr %X, ptr %Y) #0 {
+; CHECK-LABEL: define i1 @length4_eq(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret i1 [[TMP3]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) #0
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+; Function Attrs: nounwind. 4-byte memcmp compared `slt 0`; the checks expect an unsigned `ult` compare of the byte-swapped i32 loads.
+define i1 @length4_lt(ptr %X, ptr %Y) #0 {
+; CHECK-LABEL: define i1 @length4_lt(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    ret i1 [[TMP5]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) #0
+  %c = icmp slt i32 %m, 0
+  ret i1 %c
+}
+
+; Function Attrs: nounwind. 4-byte memcmp compared `sgt 0`; the checks expect an unsigned `ugt` compare of the byte-swapped i32 loads.
+define i1 @length4_gt(ptr %X, ptr %Y) #0 {
+; CHECK-LABEL: define i1 @length4_gt(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call i32 @llvm.bswap.i32(i32 [[TMP2]])
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    ret i1 [[TMP5]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) #0
+  %c = icmp sgt i32 %m, 0
+  ret i1 %c
+}
+
+; Function Attrs: nounwind. 4-byte memcmp against @.str+1 compared `eq 0`; the checks expect one i32 load compared `eq` with the constant 875770417.
+define i1 @length4_eq_const(ptr %X) #0 {
+; CHECK-LABEL: define i1 @length4_eq_const(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; CHECK-NEXT:    [[DOTNOT:%.*]] = icmp eq i32 [[TMP1]], 875770417
+; CHECK-NEXT:    ret i1 [[DOTNOT]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([65 x i8], ptr @.str, i32 0, i32 1), i64 4) #0
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; Function Attrs: nounwind. 5-byte memcmp, i32 result returned directly; the checks expect an i32 compare block, an i8 tail block at offset 4, and a result block selecting -1 or 1.
+define i32 @length5(ptr %X, ptr %Y) #0 {
+; CHECK-LABEL: define i32 @length5(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-NEXT:  loadbb:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    br i1 [[TMP2]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call i32 @llvm.bswap.i32(i32 [[TMP0]])
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP4]], [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], i32 -1, i32 1
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; CHECK-NEXT:    [[TMP9:%.*]] = load i8, ptr [[TMP7]], align 1
+; CHECK-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = zext i8 [[TMP9]] to i32
+; CHECK-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; CHECK-NEXT:    [[TMP13:%.*]] = sub nsw i32 [[TMP11]], [[TMP12]]
+; CHECK-NEXT:    br label [[ENDBLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP13]], [[LOADBB1]] ], [ [[TMP6]], [[RES_BLOCK]] ]
+; CHECK-NEXT:    ret i32 [[PHI_RES]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) #0
+  ret i32 %m
+}
+
+; Function Attrs: nounwind. 5-byte memcmp compared `ne 0` (despite the `_eq` name); the checks expect a branch-free xor/or of an i32 load and an i8 load at offset 4, tested `ne 0`.
+define i1 @length5_eq(ptr %X, ptr %Y) #0 {
+; CHECK-LABEL: define i1 @length5_eq(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP2]], [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; CHECK-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i8 [[TMP7]], [[TMP6]]
+; CHECK-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP8]] to i32
+; CHECK-NEXT:    [[TMP10:%.*]] = or i32 [[TMP3]], [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp ne i32 [[TMP10]], 0
+; CHECK-NEXT:    ret i1 [[TMP11]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) #0
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+; Function Attrs: nounwind. 5-byte memcmp compared `slt 0`; the checks expect the two-block expansion to yield an i1 phi of `ult` compares (byte-swapped i32 head, i8 tail at offset 4).
+define i1 @length5_lt(ptr %X, ptr %Y) #0 {
+; CHECK-LABEL: define i1 @length5_lt(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-NEXT:  loadbb:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    br i1 [[TMP2]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call i32 @llvm.bswap.i32(i32 [[TMP0]])
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP4]], [[TMP3]]
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 4
+; CHECK-NEXT:    [[TMP8:%.*]] = load i8, ptr [[TMP6]], align 1
+; CHECK-NEXT:    [[TMP9:%.*]] = load i8, ptr [[TMP7]], align 1
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ult i8 [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    br label [[ENDBLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i1 [ [[TMP10]], [[LOADBB1]] ], [ [[TMP5]], [[RES_BLOCK]] ]
+; CHECK-NEXT:    ret i1 [[PHI_RES]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) #0
+  %c = icmp slt i32 %m, 0
+  ret i1 %c
+}
+
+; Function Attrs: nounwind. 7-byte memcmp compared `ne 0` (despite the `_eq` name); the checks expect two overlapping i32 loads (offsets 0 and 3), each compared `ne`, with the results or'ed.
+define i1 @length7_eq(ptr %X, ptr %Y) #0 {
+; CHECK-LABEL: define i1 @length7_eq(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr [[TMP3]], align 1
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne i32 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP9:%.*]] = or i1 [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    ret i1 [[TMP9]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 7) #0
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+; Function Attrs: nounwind. 8-byte memcmp, i32 result returned directly; the checks expect byte-swapped i64 loads whose `ugt`/`ult` flags are extended and added to form the result.
+define i32 @length8(ptr %X, ptr %Y) #0 {
+; CHECK-LABEL: define i32 @length8(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call i64 @llvm.bswap.i64(i64 [[TMP1]])
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.bswap.i64(i64 [[TMP2]])
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ugt i64 [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp ult i64 [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
+; CHECK-NEXT:    [[DOTNEG:%.*]] = sext i1 [[TMP6]] to i32
+; CHECK-NEXT:    [[TMP8:%.*]] = add nsw i32 [[DOTNEG]], [[TMP7]]
+; CHECK-NEXT:    ret i32 [[TMP8]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 8) #0
+  ret i32 %m
+}
+
+; Function Attrs: nounwind. 8-byte memcmp compared `eq 0`; the checks expect a single i64 equality compare of the two loads.
+define i1 @length8_eq(ptr %X, ptr %Y) #0 {
+; CHECK-LABEL: define i1 @length8_eq(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[DOTNOT:%.*]] = icmp eq i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret i1 [[DOTNOT]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 8) #0
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; Function Attrs: nounwind. 8-byte memcmp against the start of @.str, compared `ne 0` (despite the `_eq` name); the checks expect one i64 load compared `ne` with a folded constant.
+define i1 @length8_eq_const(ptr %X) #0 {
+; CHECK-LABEL: define i1 @length8_eq_const(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ne i64 [[TMP1]], 3978425819141910832
+; CHECK-NEXT:    ret i1 [[TMP2]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 8) #0
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+; Function Attrs: nounwind. 9-byte memcmp compared `eq 0`; the checks expect a branch-free xor/or of an i64 load and an i8 load at offset 8, tested `eq 0`.
+define i1 @length9_eq(ptr %X, ptr %Y) #0 {
+; CHECK-LABEL: define i1 @length9_eq(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP2]], [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i8 [[TMP7]], [[TMP6]]
+; CHECK-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP8]] to i64
+; CHECK-NEXT:    [[TMP10:%.*]] = or i64 [[TMP3]], [[TMP9]]
+; CHECK-NEXT:    [[DOTNOT:%.*]] = icmp eq i64 [[TMP10]], 0
+; CHECK-NEXT:    ret i1 [[DOTNOT]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 9) #0
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; Function Attrs: nounwind. 10-byte memcmp compared `eq 0`; the checks expect a branch-free xor/or of an i64 load and an i16 load at offset 8, tested `eq 0`.
+define i1 @length10_eq(ptr %X, ptr %Y) #0 {
+; CHECK-LABEL: define i1 @length10_eq(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP2]], [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i16, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i16 [[TMP7]], [[TMP6]]
+; CHECK-NEXT:    [[TMP9:%.*]] = zext i16 [[TMP8]] to i64
+; CHECK-NEXT:    [[TMP10:%.*]] = or i64 [[TMP3]], [[TMP9]]
+; CHECK-NEXT:    [[DOTNOT:%.*]] = icmp eq i64 [[TMP10]], 0
+; CHECK-NEXT:    ret i1 [[DOTNOT]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 10) #0
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; Function Attrs: nounwind. 11-byte memcmp compared `eq 0`; the checks expect two overlapping i64 loads (offsets 0 and 3), each compared `eq`, with the results and'ed.
+define i1 @length11_eq(ptr %X, ptr %Y) #0 {
+; CHECK-LABEL: define i1 @length11_eq(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[Y]], i64 3
+; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr [[TMP3]], align 1
+; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[DOTNOT2:%.*]] = and i1 [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    ret i1 [[DOTNOT2]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 11) #0
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; Function Attrs: nounwind. 12-byte memcmp compared `ne 0` (despite the `_eq` name); the checks expect a branch-free xor/or of an i64 load and an i32 load at offset 8, tested `ne 0`.
+define i1 @length12_eq(ptr %X, ptr %Y) #0 {
+; CHECK-LABEL: define i1 @length12_eq(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP2]], [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP7]], [[TMP6]]
+; CHECK-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP8]] to i64
+; CHECK-NEXT:    [[TMP10:%.*]] = or i64 [[TMP3]], [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp ne i64 [[TMP10]], 0
+; CHECK-NEXT:    ret i1 [[TMP11]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 12) #0
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+; Function Attrs: nounwind. 12-byte memcmp, i32 result returned directly; the checks expect an i64 block then an i32 block at offset 8, both feeding a shared result block through phis of the byte-swapped values.
+define i32 @length12(ptr %X, ptr %Y) #0 {
+; CHECK-LABEL: define i32 @length12(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-NEXT:  loadbb:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call i64 @llvm.bswap.i64(i64 [[TMP0]])
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call i64 @llvm.bswap.i64(i64 [[TMP1]])
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    br i1 [[TMP4]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP2]], [[LOADBB:%.*]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
+; CHECK-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP3]], [[LOADBB]] ], [ [[TMP14:%.*]], [[LOADBB1]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; CHECK-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], i32 -1, i32 1
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP7]], align 1
+; CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = tail call i32 @llvm.bswap.i32(i32 [[TMP9]])
+; CHECK-NEXT:    [[TMP12:%.*]] = tail call i32 @llvm.bswap.i32(i32 [[TMP10]])
+; CHECK-NEXT:    [[TMP13]] = zext i32 [[TMP11]] to i64
+; CHECK-NEXT:    [[TMP14]] = zext i32 [[TMP12]] to i64
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i32 [[TMP9]], [[TMP10]]
+; CHECK-NEXT:    br i1 [[TMP15]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP6]], [[RES_BLOCK]] ]
+; CHECK-NEXT:    ret i32 [[PHI_RES]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 12) #0
+  ret i32 %m
+}
+
+; Function Attrs: nounwind. 13-byte memcmp compared `eq 0`; the checks expect two overlapping i64 loads (offsets 0 and 5), each compared `eq`, with the results and'ed.
+define i1 @length13_eq(ptr %X, ptr %Y) #0 {
+; CHECK-LABEL: define i1 @length13_eq(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 5
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[Y]], i64 5
+; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr [[TMP3]], align 1
+; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[DOTNOT2:%.*]] = and i1 [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    ret i1 [[DOTNOT2]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 13) #0
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; Function Attrs: nounwind. 14-byte memcmp compared `eq 0`; the checks expect two overlapping i64 loads (offsets 0 and 6), each compared `eq`, with the results and'ed.
+define i1 @length14_eq(ptr %X, ptr %Y) #0 {
+; CHECK-LABEL: define i1 @length14_eq(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 6
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[Y]], i64 6
+; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr [[TMP3]], align 1
+; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[DOTNOT2:%.*]] = and i1 [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    ret i1 [[DOTNOT2]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 14) #0
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; Function Attrs: nounwind. 15-byte memcmp compared `eq 0`; the checks expect two overlapping i64 loads (offsets 0 and 7), each compared `eq`, with the results and'ed.
+define i1 @length15_eq(ptr %X, ptr %Y) #0 {
+; CHECK-LABEL: define i1 @length15_eq(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[Y]], i64 7
+; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr [[TMP3]], align 1
+; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[DOTNOT2:%.*]] = and i1 [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    ret i1 [[DOTNOT2]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 15) #0
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; Function Attrs: nounwind. 16-byte memcmp, i32 result returned directly; the checks expect two i64 compare blocks (offsets 0 and 8) feeding a shared result block through phis of the byte-swapped values.
+define i32 @length16(ptr %X, ptr %Y) #0 {
+; CHECK-LABEL: define i32 @length16(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-NEXT:  loadbb:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call i64 @llvm.bswap.i64(i64 [[TMP0]])
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call i64 @llvm.bswap.i64(i64 [[TMP1]])
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    br i1 [[TMP4]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP2]], [[LOADBB:%.*]] ], [ [[TMP11:%.*]], [[LOADBB1]] ]
+; CHECK-NEXT:    [[PHI_SRC2:%.*]] = phi i64 [ [[TMP3]], [[LOADBB]] ], [ [[TMP12:%.*]], [[LOADBB1]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[PHI_SRC1]], [[PHI_SRC2]]
+; CHECK-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], i32 -1, i32 1
+; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
+; CHECK:       loadbb1:
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[X]], i64 8
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[Y]], i64 8
+; CHECK-NEXT:    [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 1
+; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP11]] = tail call i64 @llvm.bswap.i64(i64 [[TMP9]])
+; CHECK-NEXT:    [[TMP12]] = tail call i64 @llvm.bswap.i64(i64 [[TMP10]])
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[TMP9]], [[TMP10]]
+; CHECK-NEXT:    br i1 [[TMP13]], label [[ENDBLOCK]], label [[RES_BLOCK]]
+; CHECK:       endblock:
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP6]], [[RES_BLOCK]] ]
+; CHECK-NEXT:    ret i32 [[PHI_RES]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 16) #0
+  ret i32 %m
+}
+
+; Function Attrs: nounwind. 16-byte memcmp compared `ne 0` (despite the `_eq` name); the checks expect a single i128 `ne` compare of the two loads.
+define i1 @length16_eq(ptr %x, ptr %y) #0 {
+; CHECK-LABEL: define i1 @length16_eq(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne i128 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret i1 [[TMP3]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 16) #0
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+; Function Attrs: nounwind. 16-byte memcmp against the start of @.str compared `eq 0`; the checks expect one i128 load compared `eq` with a folded constant.
+define i1 @length16_eq_const(ptr %X) #0 {
+; CHECK-LABEL: define i1 @length16_eq_const(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; CHECK-NEXT:    [[DOTNOT:%.*]] = icmp eq i128 [[TMP1]], 70720121592765328381466889075544961328
+; CHECK-NEXT:    ret i1 [[DOTNOT]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 16) #0
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; Function Attrs: nounwind. 24-byte memcmp, i32 result returned directly; the checks expect the memcmp call to remain (not expanded), with dereferenceable(24) inferred on both pointer arguments.
+define i32 @length24(ptr %X, ptr %Y) #0 {
+; CHECK-LABEL: define i32 @length24(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr noundef nonnull dereferenceable(24) [[X]], ptr noundef nonnull dereferenceable(24) [[Y]], i64 24) #[[ATTR5:[0-9]+]]
+; CHECK-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 24) #0
+  ret i32 %m
+}
+
+; Function Attrs: nounwind. 24-byte memcmp compared `eq 0`; the checks expect a branch-free xor/or of an i128 load and an i64 load at offset 16, tested `eq 0`.
+define i1 @length24_eq(ptr %x, ptr %y) #0 {
+; CHECK-LABEL: define i1 @length24_eq(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP2]], [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], [[TMP6]]
+; CHECK-NEXT:    [[TMP9:%.*]] = zext i64 [[TMP8]] to i128
+; CHECK-NEXT:    [[TMP10:%.*]] = or i128 [[TMP3]], [[TMP9]]
+; CHECK-NEXT:    [[DOTNOT:%.*]] = icmp eq i128 [[TMP10]], 0
+; CHECK-NEXT:    ret i1 [[DOTNOT]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 24) #0
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+; Function Attrs: nounwind. 24-byte memcmp against the start of @.str, compared `ne 0` (despite the `_eq` name); the checks expect an i128 load and an i64 load at offset 16 xor'ed with folded constants, or'ed, and tested `ne 0`.
+define i1 @length24_eq_const(ptr %X) #0 {
+; CHECK-LABEL: define i1 @length24_eq_const(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 1
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 3689065127958034230
+; CHECK-NEXT:    [[TMP6:%.*]] = zext i64 [[TMP5]] to i128
+; CHECK-NEXT:    [[TMP7:%.*]] = or i128 [[TMP2]], [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    ret i1 [[TMP8]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 24) #0
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+; Function Attrs: nounwind
+define i32 @length32(ptr %X, ptr %Y) #0 {
+; CHECK-LABEL: define i32 @length32(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr noundef nonnull dereferenceable(32) [[X]], ptr noundef nonnull dereferenceable(32) [[Y]], i64 32) #[[ATTR5]]
+; CHECK-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 32) #0
+  ret i32 %m
+}
+
+; Function Attrs: nounwind
+define i1 @length32_eq(ptr %x, ptr %y) #0 {
+; CHECK-LABEL: define i1 @length32_eq(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[Y]], i64 16
+; CHECK-NEXT:    [[TMP5:%.*]] = load i128, ptr [[TMP3]], align 1
+; CHECK-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i128 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i128 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[DOTNOT2:%.*]] = and i1 [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    ret i1 [[DOTNOT2]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 32) #0
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
+
+; Function Attrs: nounwind
+define i1 @length32_eq_const(ptr %X) #0 {
+; CHECK-LABEL: define i1 @length32_eq_const(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; CHECK-NEXT:    [[TMP3:%.*]] = load i128, ptr [[TMP2]], align 1
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne i128 [[TMP1]], 70720121592765328381466889075544961328
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne i128 [[TMP3]], 65382562593882267225249597816672106294
+; CHECK-NEXT:    [[TMP6:%.*]] = or i1 [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    ret i1 [[TMP6]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 32) #0
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
+
+; Function Attrs: nounwind
+define i32 @length64(ptr %X, ptr %Y) #0 {
+; CHECK-LABEL: define i32 @length64(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr noundef nonnull dereferenceable(64) [[X]], ptr noundef nonnull dereferenceable(64) [[Y]], i64 64) #[[ATTR5]]
+; CHECK-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 64) #0
+  ret i32 %m
+}
+
+; Function Attrs: nounwind
+define i1 @length64_eq(ptr %x, ptr %y) #0 {
+; CHECK-LABEL: define i1 @length64_eq(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr noundef nonnull dereferenceable(64) [[X]], ptr noundef nonnull dereferenceable(64) [[Y]], i64 64) #[[ATTR5]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 64) #0
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
+}
+
+; Function Attrs: nounwind
+define i1 @length64_eq_const(ptr %X) #0 {
+; CHECK-LABEL: define i1 @length64_eq_const(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr noundef nonnull dereferenceable(64) [[X]], ptr noundef nonnull dereferenceable(64) @.str, i64 64) #[[ATTR5]]
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 64) #0
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; Function Attrs: nounwind
+define i32 @huge_length(ptr %X, ptr %Y) #0 {
+; CHECK-LABEL: define i32 @huge_length(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr noundef nonnull dereferenceable(9223372036854775807) [[X]], ptr noundef nonnull dereferenceable(9223372036854775807) [[Y]], i64 9223372036854775807) #[[ATTR5]]
+; CHECK-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 9223372036854775807) #0
+  ret i32 %m
+}
+
+; Function Attrs: nounwind
+define i1 @huge_length_eq(ptr %X, ptr %Y) #0 {
+; CHECK-LABEL: define i1 @huge_length_eq(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr noundef nonnull dereferenceable(9223372036854775807) [[X]], ptr noundef nonnull dereferenceable(9223372036854775807) [[Y]], i64 9223372036854775807) #[[ATTR5]]
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 9223372036854775807) #0
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; Function Attrs: nounwind
+define i32 @nonconst_length(ptr %X, ptr %Y, i64 %size) #0 {
+; CHECK-LABEL: define i32 @nonconst_length(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]], i64 [[SIZE:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR5]]
+; CHECK-NEXT:    ret i32 [[M]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 %size) #0
+  ret i32 %m
+}
+
+; Function Attrs: nounwind
+define i1 @nonconst_length_eq(ptr %X, ptr %Y, i64 %size) #0 {
+; CHECK-LABEL: define i1 @nonconst_length_eq(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]], i64 [[SIZE:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR5]]
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 %size) #0
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+; Function Attrs: nounwind
+define i1 @bcmp_length2(ptr %X, ptr %Y) #0 {
+; CHECK-LABEL: define i1 @bcmp_length2(
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
+; CHECK-NEXT:    [[DOTNOT:%.*]] = icmp eq i16 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret i1 [[DOTNOT]]
+;
+  %m = tail call i32 @bcmp(ptr %X, ptr %Y, i64 2) #0
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nobuiltin nounwind }
diff --git a/llvm/tools/opt/opt.cpp b/llvm/tools/opt/opt.cpp
index b6068513d23063..3e3a40f217e63e 100644
--- a/llvm/tools/opt/opt.cpp
+++ b/llvm/tools/opt/opt.cpp
@@ -422,7 +422,6 @@ int main(int argc, char **argv) {
   // supported.
   initializeExpandLargeDivRemLegacyPassPass(Registry);
   initializeExpandLargeFpConvertLegacyPassPass(Registry);
-  initializeExpandMemCmpLegacyPassPass(Registry);
   initializeScalarizeMaskedMemIntrinLegacyPassPass(Registry);
   initializeSelectOptimizePass(Registry);
   initializeCallBrPreparePass(Registry);
diff --git a/llvm/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn
index 047f6583ec4e88..d618bd5bfab9aa 100644
--- a/llvm/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn
@@ -71,7 +71,6 @@ static_library("CodeGen") {
     "ExecutionDomainFix.cpp",
     "ExpandLargeDivRem.cpp",
     "ExpandLargeFpConvert.cpp",
-    "ExpandMemCmp.cpp",
     "ExpandPostRAPseudos.cpp",
     "ExpandReductions.cpp",
     "ExpandVectorPredication.cpp",
diff --git a/llvm/utils/gn/secondary/llvm/lib/Transforms/Scalar/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Transforms/Scalar/BUILD.gn
index bed26df94e2c45..876f5fece1128e 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Transforms/Scalar/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Transforms/Scalar/BUILD.gn
@@ -23,6 +23,7 @@ static_library("Scalar") {
     "DeadStoreElimination.cpp",
     "DivRemPairs.cpp",
     "EarlyCSE.cpp",
+    "ExpandMemCmp.cpp",
     "FlattenCFGPass.cpp",
     "Float2Int.cpp",
     "GVN.cpp",

>From 0907c0514d957832263ee6765d3d0b17ceae2454 Mon Sep 17 00:00:00 2001
From: Gabriel Baraldi <baraldigabriel at gmail.com>
Date: Mon, 8 Jan 2024 19:17:16 -0300
Subject: [PATCH 02/11] Whitespace fixes

---
 .../include/llvm/CodeGen/CodeGenPassBuilder.h |  1 -
 llvm/include/llvm/CodeGen/Passes.h            |  1 -
 llvm/include/llvm/InitializePasses.h          |  2 +-
 llvm/include/llvm/LinkAllPasses.h             |  2 +-
 llvm/lib/CodeGen/TargetPassConfig.cpp         |  1 -
 llvm/lib/Transforms/Scalar/ExpandMemCmp.cpp   | 38 +++++++++----------
 6 files changed, 19 insertions(+), 26 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/CodeGenPassBuilder.h b/llvm/include/llvm/CodeGen/CodeGenPassBuilder.h
index 556304231b397b..d132837b439fb3 100644
--- a/llvm/include/llvm/CodeGen/CodeGenPassBuilder.h
+++ b/llvm/include/llvm/CodeGen/CodeGenPassBuilder.h
@@ -628,7 +628,6 @@ void CodeGenPassBuilder<Derived>::addIRPasses(AddIRPass &addPass) const {
       addPass(PrintFunctionPass(dbgs(), "\n\n*** Code after LSR ***\n"));
   }
 
-
   // Run GC lowering passes for builtin collectors
   // TODO: add a pass insertion point here
   addPass(GCLoweringPass());
diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h
index e5ed5f15f62ed7..f8fae1b91314bc 100644
--- a/llvm/include/llvm/CodeGen/Passes.h
+++ b/llvm/include/llvm/CodeGen/Passes.h
@@ -519,7 +519,6 @@ namespace llvm {
   // Expands large div/rem instructions.
   FunctionPass *createExpandLargeFpConvertPass();
 
-
   /// Creates Break False Dependencies pass. \see BreakFalseDeps.cpp
   FunctionPass *createBreakFalseDeps();
 
diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h
index b0ca9fa942cda3..b59ddf71743383 100644
--- a/llvm/include/llvm/InitializePasses.h
+++ b/llvm/include/llvm/InitializePasses.h
@@ -102,7 +102,7 @@ void initializeEarlyTailDuplicatePass(PassRegistry&);
 void initializeEdgeBundlesPass(PassRegistry&);
 void initializeEHContGuardCatchretPass(PassRegistry &);
 void initializeExpandLargeFpConvertLegacyPassPass(PassRegistry&);
-void initializeExpandLargeDivRemLegacyPassPass(PassRegistry&);
+void initializeExpandLargeDivRemLegacyPassPass(PassRegistry &);
 void initializeExpandPostRAPass(PassRegistry&);
 void initializeExpandReductionsPass(PassRegistry&);
 void initializeExpandVectorPredicationPass(PassRegistry &);
diff --git a/llvm/include/llvm/LinkAllPasses.h b/llvm/include/llvm/LinkAllPasses.h
index 9aff428fbe938b..e414738dd325e8 100644
--- a/llvm/include/llvm/LinkAllPasses.h
+++ b/llvm/include/llvm/LinkAllPasses.h
@@ -118,7 +118,7 @@ namespace {
       (void) llvm::createGVNPass();
       (void) llvm::createPostDomTree();
       (void) llvm::createMergeICmpsLegacyPass();
-      (void) llvm::createExpandLargeDivRemPass();
+      (void)llvm::createExpandLargeDivRemPass();
       (void) llvm::createExpandVectorPredicationPass();
       std::string buf;
       llvm::raw_string_ostream os(buf);
diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp
index 33562e90e94426..ac65bbab05a4b4 100644
--- a/llvm/lib/CodeGen/TargetPassConfig.cpp
+++ b/llvm/lib/CodeGen/TargetPassConfig.cpp
@@ -867,7 +867,6 @@ void TargetPassConfig::addIRPasses() {
         addPass(createPrintFunctionPass(dbgs(),
                                         "\n\n*** Code after LSR ***\n"));
     }
-
   }
 
   // Run GC lowering passes for builtin collectors
diff --git a/llvm/lib/Transforms/Scalar/ExpandMemCmp.cpp b/llvm/lib/Transforms/Scalar/ExpandMemCmp.cpp
index 973875ee142978..d2124212526adf 100644
--- a/llvm/lib/Transforms/Scalar/ExpandMemCmp.cpp
+++ b/llvm/lib/Transforms/Scalar/ExpandMemCmp.cpp
@@ -36,7 +36,6 @@
 using namespace llvm;
 using namespace llvm::PatternMatch;
 
-
 #define DEBUG_TYPE "expand-memcmp"
 
 STATISTIC(NumMemCmpCalls, "Number of memcmp calls");
@@ -60,7 +59,6 @@ static cl::opt<unsigned> MaxLoadsPerMemcmpOptSize(
 
 namespace {
 
-
 // This class provides helper functions to expand a memcmp library call into an
 // inline expansion.
 class MemCmpExpansion {
@@ -90,8 +88,7 @@ class MemCmpExpansion {
   // 1x1-byte load, which would be represented as [{16, 0}, {16, 16}, {1, 32}.
   struct LoadEntry {
     LoadEntry(unsigned LoadSize, uint64_t Offset)
-        : LoadSize(LoadSize), Offset(Offset) {
-    }
+        : LoadSize(LoadSize), Offset(Offset) {}
 
     // The size of the load for this block, in bytes.
     unsigned LoadSize;
@@ -724,7 +721,8 @@ Value *MemCmpExpansion::getMemCmpExpansion() {
     // calculate which source was larger. The calculation requires the
     // two loaded source values of each load compare block.
     // These will be saved in the phi nodes created by setupResultBlockPHINodes.
-    if (!IsUsedForZeroCmp) setupResultBlockPHINodes();
+    if (!IsUsedForZeroCmp)
+      setupResultBlockPHINodes();
 
     // Create the number of required load compare basic blocks.
     createLoadCmpBlocks();
@@ -845,16 +843,15 @@ static bool expandMemCmp(CallInst *CI, const TargetTransformInfo *TTI,
   }
   const uint64_t SizeVal = SizeCast->getZExtValue();
 
-
   // TTI call to check if target would like to expand memcmp. Also, get the
   // available load sizes.
   const bool IsUsedForZeroCmp =
       IsBCmp || isOnlyUsedInZeroEqualityComparison(CI);
   bool OptForSize = CI->getFunction()->hasOptSize() ||
                     llvm::shouldOptimizeForSize(CI->getParent(), PSI, BFI);
-  auto Options = TTI->enableMemCmpExpansion(OptForSize,
-                                            IsUsedForZeroCmp);
-  if (!Options) return false;
+  auto Options = TTI->enableMemCmpExpansion(OptForSize, IsUsedForZeroCmp);
+  if (!Options)
+    return false;
   Value *Res = nullptr;
 
   if (SizeVal == 0) {
@@ -863,8 +860,7 @@ static bool expandMemCmp(CallInst *CI, const TargetTransformInfo *TTI,
     if (MemCmpEqZeroNumLoadsPerBlock.getNumOccurrences())
       Options.NumLoadsPerBlock = MemCmpEqZeroNumLoadsPerBlock;
 
-    if (OptForSize &&
-        MaxLoadsPerMemcmpOptSize.getNumOccurrences())
+    if (OptForSize && MaxLoadsPerMemcmpOptSize.getNumOccurrences())
       Options.MaxNumLoads = MaxLoadsPerMemcmpOptSize;
 
     if (!OptForSize && MaxLoadsPerMemcmp.getNumOccurrences())
@@ -892,20 +888,19 @@ static bool expandMemCmp(CallInst *CI, const TargetTransformInfo *TTI,
 
 // Returns true if a change was made.
 static bool runOnBlock(BasicBlock &BB, const TargetLibraryInfo *TLI,
-                       const TargetTransformInfo *TTI,
-                       const DataLayout &DL, ProfileSummaryInfo *PSI,
-                       BlockFrequencyInfo *BFI, DomTreeUpdater *DTU);
+                       const TargetTransformInfo *TTI, const DataLayout &DL,
+                       ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI,
+                       DomTreeUpdater *DTU);
 
 static PreservedAnalyses runImpl(Function &F, const TargetLibraryInfo *TLI,
                                  const TargetTransformInfo *TTI,
                                  ProfileSummaryInfo *PSI,
                                  BlockFrequencyInfo *BFI, DominatorTree *DT);
 
-
 bool runOnBlock(BasicBlock &BB, const TargetLibraryInfo *TLI,
-                const TargetTransformInfo *TTI,
-                const DataLayout &DL, ProfileSummaryInfo *PSI,
-                BlockFrequencyInfo *BFI, DomTreeUpdater *DTU) {
+                const TargetTransformInfo *TTI, const DataLayout &DL,
+                ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI,
+                DomTreeUpdater *DTU) {
   for (Instruction &I : BB) {
     CallInst *CI = dyn_cast<CallInst>(&I);
     if (!CI) {
@@ -922,13 +917,14 @@ bool runOnBlock(BasicBlock &BB, const TargetLibraryInfo *TLI,
 }
 
 PreservedAnalyses runImpl(Function &F, const TargetLibraryInfo *TLI,
-                          const TargetTransformInfo *TTI, ProfileSummaryInfo *PSI,
-                          BlockFrequencyInfo *BFI, DominatorTree *DT) {
+                          const TargetTransformInfo *TTI,
+                          ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI,
+                          DominatorTree *DT) {
   std::optional<DomTreeUpdater> DTU;
   if (DT)
     DTU.emplace(DT, DomTreeUpdater::UpdateStrategy::Lazy);
 
-  const DataLayout& DL = F.getParent()->getDataLayout();
+  const DataLayout &DL = F.getParent()->getDataLayout();
   bool MadeChanges = false;
   for (auto BBIt = F.begin(); BBIt != F.end();) {
     if (runOnBlock(*BBIt, TLI, TTI, DL, PSI, BFI, DTU ? &*DTU : nullptr)) {

>From 77d7e14f3f158c5d9a0724ebebcbcd1edbda4730 Mon Sep 17 00:00:00 2001
From: Gabriel Baraldi <baraldigabriel at gmail.com>
Date: Tue, 9 Jan 2024 17:24:36 -0300
Subject: [PATCH 03/11] Apply suggestions from code review and skip memcmp
 expansion when sanitizers are on

---
 llvm/lib/Transforms/Scalar/ExpandMemCmp.cpp   |  8 ++
 .../test/CodeGen/AArch64/dag-combine-setcc.ll | 50 +++++-----
 .../AArch64/machine-licm-hoist-load.ll        | 94 +++++++++----------
 3 files changed, 80 insertions(+), 72 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/ExpandMemCmp.cpp b/llvm/lib/Transforms/Scalar/ExpandMemCmp.cpp
index d2124212526adf..ed3843de422f00 100644
--- a/llvm/lib/Transforms/Scalar/ExpandMemCmp.cpp
+++ b/llvm/lib/Transforms/Scalar/ExpandMemCmp.cpp
@@ -920,6 +920,14 @@ PreservedAnalyses runImpl(Function &F, const TargetLibraryInfo *TLI,
                           const TargetTransformInfo *TTI,
                           ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI,
                           DominatorTree *DT) {
+  // Sanitizers prefer that calls to memcmp remain as such
+  // so that they may be intercepted, but since the sanitizer passes run late
+  // we disable the optimization here. See maybeMarkSanitizerLibraryCallNoBuiltin
+  if (F.hasFnAttribute(Attribute::SanitizeMemory) ||
+      F.hasFnAttribute(Attribute::SanitizeAddress) ||
+      F.hasFnAttribute(Attribute::SanitizeHWAddress) ||
+      F.hasFnAttribute(Attribute::SanitizeThread))
+    return PreservedAnalyses::all();
   std::optional<DomTreeUpdater> DTU;
   if (DT)
     DTU.emplace(DT, DomTreeUpdater::UpdateStrategy::Lazy);
diff --git a/llvm/test/CodeGen/AArch64/dag-combine-setcc.ll b/llvm/test/CodeGen/AArch64/dag-combine-setcc.ll
index 855a5b23f6c1cc..f58de2efb1c310 100644
--- a/llvm/test/CodeGen/AArch64/dag-combine-setcc.ll
+++ b/llvm/test/CodeGen/AArch64/dag-combine-setcc.ll
@@ -266,19 +266,19 @@ define i1 @combine_setcc_eq0_conjunction_xor_or(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ccmp x10, x11, #0, eq
 ; CHECK-NEXT:    cset w0, eq
 ; CHECK-NEXT:    ret
-  %1 = load i64, ptr %a, align 1
-  %2 = load i64, ptr %b, align 1
-  %3 = xor i64 %1, %2
-  %4 = getelementptr i8, ptr %a, i64 8
-  %5 = getelementptr i8, ptr %b, i64 8
-  %6 = load i64, ptr %4, align 1
-  %7 = load i64, ptr %5, align 1
-  %8 = xor i64 %6, %7
-  %9 = or i64 %3, %8
-  %10 = icmp ne i64 %9, 0
-  %11 = zext i1 %10 to i32
-  %cmp = icmp eq i32 %11, 0
-  ret i1 %cmp
+  %a.0 = load i64, ptr %a, align 1
+  %b.0 = load i64, ptr %b, align 1
+  %xor1 = xor i64 %a.0, %b.0
+  %1 = getelementptr i8, ptr %a, i64 8
+  %2 = getelementptr i8, ptr %b, i64 8
+  %a.8 = load i64, ptr %1, align 1
+  %b.8 = load i64, ptr %2, align 1
+  %xor2 = xor i64 %a.8, %b.8
+  %or = or i64 %xor1, %xor2
+  %cmp1 = icmp ne i64 %or, 0
+  %ext = zext i1 %cmp to i32
+  %cmp2 = icmp eq i32 %ext, 0
+  ret i1 %cmp2
 }
 
 define i1 @combine_setcc_ne0_conjunction_xor_or(ptr %a, ptr %b) {
@@ -290,18 +290,18 @@ define i1 @combine_setcc_ne0_conjunction_xor_or(ptr %a, ptr %b) {
 ; CHECK-NEXT:    ccmp x10, x11, #0, eq
 ; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret
-  %1 = load i64, ptr %a, align 1
-  %2 = load i64, ptr %b, align 1
-  %3 = xor i64 %1, %2
-  %4 = getelementptr i8, ptr %a, i64 8
-  %5 = getelementptr i8, ptr %b, i64 8
-  %6 = load i64, ptr %4, align 1
-  %7 = load i64, ptr %5, align 1
-  %8 = xor i64 %6, %7
-  %9 = or i64 %3, %8
-  %10 = icmp ne i64 %9, 0
-  %11 = zext i1 %10 to i32
-  ret i1 %10
+  %a.0 = load i64, ptr %a, align 1
+  %b.0 = load i64, ptr %b, align 1
+  %xor1 = xor i64 %a.0, %b.0
+  %1 = getelementptr i8, ptr %a, i64 8
+  %2 = getelementptr i8, ptr %b, i64 8
+  %a.8 = load i64, ptr %1, align 1
+  %b.8 = load i64, ptr %2, align 1
+  %xor2 = xor i64 %a.8, %b.8
+  %or = or i64 %xor1, %xor2
+  %cmp = icmp ne i64 %or, 0
+  %ext = zext i1 %cmp to i32
+  ret i1 %ext
 }
 
 ; Doesn't increase the number of instructions, where the LHS has multiple uses
diff --git a/llvm/test/CodeGen/AArch64/machine-licm-hoist-load.ll b/llvm/test/CodeGen/AArch64/machine-licm-hoist-load.ll
index fc0bc1b9661163..0651027b7b9b5f 100644
--- a/llvm/test/CodeGen/AArch64/machine-licm-hoist-load.ll
+++ b/llvm/test/CodeGen/AArch64/machine-licm-hoist-load.ll
@@ -30,11 +30,11 @@ for.body:                                         ; preds = %for.body, %entry
   %sum.05 = phi i64 [ %spec.select, %for.body ], [ 0, %entry ]
   %arrayidx = getelementptr inbounds ptr, ptr %a, i64 %i.06
   %0 = load ptr, ptr %arrayidx, align 8
-  %1 = load i32, ptr %0, align 1
-  %2 = load i32, ptr %b, align 1
-  %3 = icmp ne i32 %1, %2
-  %4 = zext i1 %3 to i32
-  %tobool = icmp eq i32 %4, 0
+  %bcmp_exp = load i32, ptr %0, align 1
+  %bcmp_exp2 = load i32, ptr %b, align 1
+  %cmp = icmp ne i32 %bcmp_exp, %bcmp_exp2
+  %res = zext i1 %cmp to i32
+  %tobool = icmp eq i32 %res, 0
   %add = zext i1 %tobool to i64
   %spec.select = add i64 %sum.05, %add
   %inc = add nuw i64 %i.06, 1
@@ -94,11 +94,11 @@ for.body4:                                        ; preds = %for.body4, %for.con
   %sum.115 = phi i64 [ %sum.018, %for.cond1.preheader ], [ %spec.select, %for.body4 ]
   %arrayidx5 = getelementptr inbounds ptr, ptr %0, i64 %j.016
   %1 = load ptr, ptr %arrayidx5, align 8
-  %2 = load i32, ptr %1, align 1
-  %3 = load i32, ptr %b, align 1
-  %4 = icmp ne i32 %2, %3
-  %5 = zext i1 %4 to i32
-  %tobool = icmp eq i32 %5, 0
+  %bcmp_exp = load i32, ptr %1, align 1
+  %bcmp_exp2 = load i32, ptr %b, align 1
+  %cmp = icmp ne i32 %bcmp_exp, %bcmp_exp2
+  %res = zext i1 %cmp to i32
+  %tobool = icmp eq i32 %res, 0
   %add = zext i1 %tobool to i64
   %spec.select = add i64 %sum.115, %add
   %inc = add nuw i64 %j.016, 1
@@ -184,11 +184,11 @@ for.body8:                                        ; preds = %for.body8, %for.con
   %sum.225 = phi i64 [ %sum.128, %for.cond5.preheader ], [ %spec.select, %for.body8 ]
   %arrayidx10 = getelementptr inbounds ptr, ptr %1, i64 %k.026
   %2 = load ptr, ptr %arrayidx10, align 8
-  %3 = load i32, ptr %2, align 1
-  %4 = load i32, ptr %b, align 1
-  %5 = icmp ne i32 %3, %4
-  %6 = zext i1 %5 to i32
-  %tobool = icmp eq i32 %6, 0
+  %bcmp_exp = load i32, ptr %2, align 1
+  %bcmp_exp2 = load i32, ptr %b, align 1
+  %cmp = icmp ne i32 %bcmp_exp, %bcmp_exp2
+  %res = zext i1 %cmp to i32
+  %tobool = icmp eq i32 %res, 0
   %add = zext i1 %tobool to i64
   %spec.select = add i64 %sum.225, %add
   %inc = add nuw i64 %k.026, 1
@@ -284,11 +284,11 @@ for.body8:                                        ; preds = %for.body8, %for.con
   %sum.227 = phi i64 [ %sum.130, %for.cond5.preheader ], [ %spec.select, %for.body8 ]
   %arrayidx10 = getelementptr inbounds ptr, ptr %1, i64 %k.028
   %3 = load ptr, ptr %arrayidx10, align 8
-  %4 = load i32, ptr %3, align 1
-  %5 = load i32, ptr %2, align 1
-  %6 = icmp ne i32 %4, %5
-  %7 = zext i1 %6 to i32
-  %tobool = icmp eq i32 %7, 0
+  %bcmp_exp = load i32, ptr %3, align 1
+  %bcmp_exp2 = load i32, ptr %2, align 1
+  %cmp = icmp ne i32 %bcmp_exp, %bcmp_exp2
+  %res = zext i1 %cmp to i32
+  %tobool = icmp eq i32 %res, 0
   %add = zext i1 %tobool to i64
   %spec.select = add i64 %sum.227, %add
   %inc = add nuw i64 %k.028, 1
@@ -344,16 +344,16 @@ for.body:                                         ; preds = %for.body, %for.body
   %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
   %arrayidx = getelementptr inbounds ptr, ptr %a, i64 %indvars.iv
   %0 = load ptr, ptr %arrayidx, align 8
-  %1 = load i32, ptr %0, align 1
-  %2 = load i32, ptr %b, align 1
-  %3 = call i32 @llvm.bswap.i32(i32 %1)
-  %4 = call i32 @llvm.bswap.i32(i32 %2)
-  %5 = icmp ugt i32 %3, %4
-  %6 = icmp ult i32 %3, %4
-  %7 = zext i1 %5 to i32
-  %8 = zext i1 %6 to i32
-  %9 = sub i32 %7, %8
-  %conv = trunc i32 %9 to i8
+  %memcmp_exp = load i32, ptr %0, align 1
+  %memcmp_exp2 = load i32, ptr %b, align 1
+  %swap = call i32 @llvm.bswap.i32(i32 %memcmp_exp)
+  %swap2 = call i32 @llvm.bswap.i32(i32 %memcmp_exp2)
+  %cmp1 = icmp ugt i32 %swap, %swap2
+  %cmp2 = icmp ult i32 %swap, %swap2
+  %ext1 = zext i1 %cmp1 to i32
+  %ext2 = zext i1 %cmp2 to i32
+  %res = sub i32 %7, %8
+  %conv = trunc i32 %res to i8
   %arrayidx2 = getelementptr inbounds i8, ptr %c, i64 %indvars.iv
   store i8 %conv, ptr %arrayidx2, align 1
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
@@ -410,11 +410,11 @@ for.body:                                         ; preds = %for.body, %for.body
   %sum.05 = phi i32 [ 0, %for.body.preheader ], [ %spec.select, %for.body ]
   %arrayidx = getelementptr inbounds ptr, ptr %a, i64 %indvars.iv
   %0 = load ptr, ptr %arrayidx, align 8
-  %1 = load i32, ptr %0, align 1
-  %2 = load i32, ptr %b, align 1
-  %3 = icmp ne i32 %1, %2
-  %4 = zext i1 %3 to i32
-  %tobool.not = icmp eq i32 %4, 0
+  %bcmp_exp = load i32, ptr %0, align 1
+  %bcmp_exp2 = load i32, ptr %b, align 1
+  %cmp = icmp ne i32 %bcmp, %bcmp_exp2
+  %res = zext i1 %cmp to i32
+  %tobool.not = icmp eq i32 %res, 0
   %add = zext i1 %tobool.not to i32
   %spec.select = add nuw nsw i32 %sum.05, %add
   tail call void @func()
@@ -459,20 +459,20 @@ for.body:                                         ; preds = %for.body, %entry
   %sum.05 = phi i64 [ %spec.select, %for.body ], [ 0, %entry ]
   %arrayidx = getelementptr inbounds ptr, ptr %a, i64 %i.06
   %0 = load ptr, ptr %arrayidx, align 8
-  %1 = load i32, ptr %0, align 1
-  %2 = load i32, ptr %b, align 1
-  %3 = xor i32 %1, %2
+  %bcmp_exp = load i32, ptr %0, align 1
+  %bcmp_exp2 = load i32, ptr %b, align 1
+  %xor1 = xor i32 %bcmp_exp, %bcmp_exp2
   %4 = getelementptr i8, ptr %0, i64 4
   %5 = getelementptr i8, ptr %b, i64 4
-  %6 = load i16, ptr %4, align 1
-  %7 = load i16, ptr %5, align 1
-  %8 = zext i16 %6 to i32
-  %9 = zext i16 %7 to i32
-  %10 = xor i32 %8, %9
-  %11 = or i32 %3, %10
-  %12 = icmp ne i32 %11, 0
-  %13 = zext i1 %12 to i32
-  %tobool = icmp eq i32 %13, 0
+  %bcmp_exp3 = load i16, ptr %4, align 1
+  %bcmp_exp4 = load i16, ptr %5, align 1
+  %ext = zext i16 %6 to i32
+  %ext2 = zext i16 %7 to i32
+  %xor2 = xor i32 %ext, %ext2
+  %or = or i32 %xor1, %xor2
+  %cmp = icmp ne i32 %or, 0
+  %res = zext i1 %cmp to i32
+  %tobool = icmp eq i32 %res, 0
   %add = zext i1 %tobool to i64
   %spec.select = add i64 %sum.05, %add
   %inc = add nuw i64 %i.06, 1

>From 22dae428c22a01b39aecae943a18f646eabc9932 Mon Sep 17 00:00:00 2001
From: Gabriel Baraldi <baraldigabriel at gmail.com>
Date: Wed, 10 Jan 2024 11:13:36 -0300
Subject: [PATCH 04/11] Move pass to later in the pipeline!

---
 llvm/lib/Passes/PassBuilderPipelines.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index e2dd413f12d696..391d35e968ae1c 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -388,8 +388,6 @@ PassBuilder::buildO1FunctionSimplificationPipeline(OptimizationLevel Level,
   if (AreStatisticsEnabled())
     FPM.addPass(CountVisitsPass());
 
-  FPM.addPass(MergeICmpsPass());
-  FPM.addPass(ExpandMemCmpPass(TM));
   // Form SSA out of local memory accesses after breaking apart aggregates into
   // scalars.
   FPM.addPass(SROAPass(SROAOptions::ModifyCFG));
@@ -536,8 +534,6 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
   if (AreStatisticsEnabled())
     FPM.addPass(CountVisitsPass());
 
-  FPM.addPass(MergeICmpsPass());
-  FPM.addPass(ExpandMemCmpPass(TM));
   // Form SSA out of local memory accesses after breaking apart aggregates into
   // scalars.
   FPM.addPass(SROAPass(SROAOptions::ModifyCFG));
@@ -1436,6 +1432,10 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level,
   // result too early.
   OptimizePM.addPass(LoopSinkPass());
 
+  // Detect and convert memcmp-like idioms to the call, and expand when profitable
+  OptimizePM.addPass(MergeICmpsPass());
+  OptimizePM.addPass(ExpandMemCmpPass(TM));
+
   // And finally clean up LCSSA form before generating code.
   OptimizePM.addPass(InstSimplifyPass());
 

>From 63a015f24bd77c8fdb55e6d1afd5fb018eee2484 Mon Sep 17 00:00:00 2001
From: Gabriel Baraldi <baraldigabriel at gmail.com>
Date: Wed, 10 Jan 2024 18:10:17 +0000
Subject: [PATCH 05/11] Apply suggestions from code review

---
 .../test/CodeGen/AArch64/dag-combine-setcc.ll |   4 +-
 .../AArch64/machine-licm-hoist-load.ll        |  16 +-
 llvm/test/Other/new-pm-defaults.ll            |   8 +-
 .../Other/new-pm-thinlto-postlink-defaults.ll |   6 +-
 .../new-pm-thinlto-postlink-pgo-defaults.ll   |   6 +-
 ...-pm-thinlto-postlink-samplepgo-defaults.ll |   6 +-
 .../Other/new-pm-thinlto-prelink-defaults.ll  |   4 +-
 .../new-pm-thinlto-prelink-pgo-defaults.ll    |  26 +-
 ...w-pm-thinlto-prelink-samplepgo-defaults.ll |   4 +-
 .../PhaseOrdering/X86/memcmp-early.ll         |  41 +-
 .../Transforms/PhaseOrdering/X86/memcmp.ll    | 403 ++++++++++--------
 11 files changed, 293 insertions(+), 231 deletions(-)

diff --git a/llvm/test/CodeGen/AArch64/dag-combine-setcc.ll b/llvm/test/CodeGen/AArch64/dag-combine-setcc.ll
index f58de2efb1c310..f22d3acb75026f 100644
--- a/llvm/test/CodeGen/AArch64/dag-combine-setcc.ll
+++ b/llvm/test/CodeGen/AArch64/dag-combine-setcc.ll
@@ -276,7 +276,7 @@ define i1 @combine_setcc_eq0_conjunction_xor_or(ptr %a, ptr %b) {
   %xor2 = xor i64 %a.8, %b.8
   %or = or i64 %xor1, %xor2
   %cmp1 = icmp ne i64 %or, 0
-  %ext = zext i1 %cmp to i32
+  %ext = zext i1 %cmp1 to i32
   %cmp2 = icmp eq i32 %ext, 0
   ret i1 %cmp2
 }
@@ -301,7 +301,7 @@ define i1 @combine_setcc_ne0_conjunction_xor_or(ptr %a, ptr %b) {
   %or = or i64 %xor1, %xor2
   %cmp = icmp ne i64 %or, 0
   %ext = zext i1 %cmp to i32
-  ret i1 %ext
+  ret i1 %cmp
 }
 
 ; Doesn't increase the number of instructions, where the LHS has multiple uses
diff --git a/llvm/test/CodeGen/AArch64/machine-licm-hoist-load.ll b/llvm/test/CodeGen/AArch64/machine-licm-hoist-load.ll
index 0651027b7b9b5f..4c2188cf340e87 100644
--- a/llvm/test/CodeGen/AArch64/machine-licm-hoist-load.ll
+++ b/llvm/test/CodeGen/AArch64/machine-licm-hoist-load.ll
@@ -352,7 +352,7 @@ for.body:                                         ; preds = %for.body, %for.body
   %cmp2 = icmp ult i32 %swap, %swap2
   %ext1 = zext i1 %cmp1 to i32
   %ext2 = zext i1 %cmp2 to i32
-  %res = sub i32 %7, %8
+  %res = sub i32 %ext1, %ext2
   %conv = trunc i32 %res to i8
   %arrayidx2 = getelementptr inbounds i8, ptr %c, i64 %indvars.iv
   store i8 %conv, ptr %arrayidx2, align 1
@@ -412,7 +412,7 @@ for.body:                                         ; preds = %for.body, %for.body
   %0 = load ptr, ptr %arrayidx, align 8
   %bcmp_exp = load i32, ptr %0, align 1
   %bcmp_exp2 = load i32, ptr %b, align 1
-  %cmp = icmp ne i32 %bcmp, %bcmp_exp2
+  %cmp = icmp ne i32 %bcmp_exp, %bcmp_exp2
   %res = zext i1 %cmp to i32
   %tobool.not = icmp eq i32 %res, 0
   %add = zext i1 %tobool.not to i32
@@ -462,12 +462,12 @@ for.body:                                         ; preds = %for.body, %entry
   %bcmp_exp = load i32, ptr %0, align 1
   %bcmp_exp2 = load i32, ptr %b, align 1
   %xor1 = xor i32 %bcmp_exp, %bcmp_exp2
-  %4 = getelementptr i8, ptr %0, i64 4
-  %5 = getelementptr i8, ptr %b, i64 4
-  %bcmp_exp3 = load i16, ptr %4, align 1
-  %bcmp_exp4 = load i16, ptr %5, align 1
-  %ext = zext i16 %6 to i32
-  %ext2 = zext i16 %7 to i32
+  %gep0 = getelementptr i8, ptr %0, i64 4
+  %gepb = getelementptr i8, ptr %b, i64 4
+  %bcmp_exp3 = load i16, ptr %gep0, align 1
+  %bcmp_exp4 = load i16, ptr %gepb, align 1
+  %ext = zext i16 %bcmp_exp3 to i32
+  %ext2 = zext i16 %bcmp_exp4 to i32
   %xor2 = xor i32 %ext, %ext2
   %or = or i32 %xor1, %xor2
   %cmp = icmp ne i32 %or, 0
diff --git a/llvm/test/Other/new-pm-defaults.ll b/llvm/test/Other/new-pm-defaults.ll
index ce13b2eb52a7ef..c5d70a6a0b2c33 100644
--- a/llvm/test/Other/new-pm-defaults.ll
+++ b/llvm/test/Other/new-pm-defaults.ll
@@ -142,12 +142,10 @@
 ; CHECK-O2-NEXT: Running pass: OpenMPOptCGSCCPass on (foo)
 ; CHECK-O3-NEXT: Running pass: OpenMPOptCGSCCPass on (foo)
 ; CHECK-EP-CGSCC-LATE-NEXT: Running pass: NoOpCGSCCPass
-; CHECK-O-NEXT: Running pass: MergeICmpsPass on foo
-; CHECK-O-NEXT: Running analysis: AAManager on foo
-; CHECK-O-NEXT: Running pass: ExpandMemCmpPass on foo
 ; CHECK-O-NEXT: Running pass: SROAPass
 ; CHECK-O-NEXT: Running pass: EarlyCSEPass
 ; CHECK-O-NEXT: Running analysis: MemorySSAAnalysis
+; CHECK-O-NEXT: Running analysis: AAManager
 ; CHECK-O23SZ-NEXT: Running pass: SpeculativeExecutionPass
 ; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass
 ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis
@@ -269,6 +267,8 @@
 ; CHECK-O-NEXT: Running pass: LICMPass
 ; CHECK-O-NEXT: Running pass: AlignmentFromAssumptionsPass
 ; CHECK-O-NEXT: Running pass: LoopSinkPass
+; CHECK-O-NEXT: Running pass: MergeICmpsPass
+; CHECK-O-NEXT: Running pass: ExpandMemCmpPass
 ; CHECK-O-NEXT: Running pass: InstSimplifyPass
 ; CHECK-O-NEXT: Running pass: DivRemPairsPass
 ; CHECK-O-NEXT: Running pass: TailCallElimPass
@@ -315,4 +315,4 @@ loop:
   br i1 %cmp, label %exit, label %loop
 exit:
   ret void
-}
+}
\ No newline at end of file
diff --git a/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll
index d6f09a85953c14..a31a1b069b0474 100644
--- a/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll
@@ -81,12 +81,10 @@
 ; CHECK-O3-NEXT: Running pass: ArgumentPromotionPass
 ; CHECK-O2-NEXT: Running pass: OpenMPOptCGSCCPass on (foo)
 ; CHECK-O3-NEXT: Running pass: OpenMPOptCGSCCPass on (foo)
-; CHECK-O-NEXT: Running pass: MergeICmpsPass on foo
-; CHECK-O-NEXT: Running analysis: AAManager on foo
-; CHECK-O-NEXT: Running pass: ExpandMemCmpPass on foo
 ; CHECK-O-NEXT: Running pass: SROAPass
 ; CHECK-O-NEXT: Running pass: EarlyCSEPass
 ; CHECK-O-NEXT: Running analysis: MemorySSAAnalysis
+; CHECK-O-NEXT: Running analysis: AAManager
 ; CHECK-O23SZ-NEXT: Running pass: SpeculativeExecutionPass
 ; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass
 ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis
@@ -197,6 +195,8 @@
 ; CHECK-POSTLINK-O-NEXT: Running pass: LICMPass
 ; CHECK-POSTLINK-O-NEXT: Running pass: AlignmentFromAssumptionsPass
 ; CHECK-POSTLINK-O-NEXT: Running pass: LoopSinkPass
+; CHECK-POSTLINK-O-NEXT: Running pass: MergeICmpsPass
+; CHECK-POSTLINK-O-NEXT: Running pass: ExpandMemCmpPass
 ; CHECK-POSTLINK-O-NEXT: Running pass: InstSimplifyPass
 ; CHECK-POSTLINK-O-NEXT: Running pass: DivRemPairsPass
 ; CHECK-POSTLINK-O-NEXT: Running pass: TailCallElimPass
diff --git a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
index cc3939c5bdcf7b..03a77c0d5bc8b6 100644
--- a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
@@ -69,12 +69,10 @@
 ; CHECK-O3-NEXT: Running pass: ArgumentPromotionPass
 ; CHECK-O2-NEXT: Running pass: OpenMPOptCGSCCPass
 ; CHECK-O3-NEXT: Running pass: OpenMPOptCGSCCPass
-; CHECK-O-NEXT: Running pass: MergeICmpsPass on foo
-; CHECK-O-NEXT: Running analysis: AAManager on foo
-; CHECK-O-NEXT: Running pass: ExpandMemCmpPass on foo
 ; CHECK-O-NEXT: Running pass: SROAPass
 ; CHECK-O-NEXT: Running pass: EarlyCSEPass
 ; CHECK-O-NEXT: Running analysis: MemorySSAAnalysis
+; CHECK-O-NEXT: Running analysis: AAManager
 ; CHECK-O23SZ-NEXT: Running pass: SpeculativeExecutionPass
 ; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass
 ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis
@@ -182,6 +180,8 @@
 ; CHECK-O-NEXT: Running pass: LICMPass
 ; CHECK-O-NEXT: Running pass: AlignmentFromAssumptionsPass
 ; CHECK-O-NEXT: Running pass: LoopSinkPass
+; CHECK-O-NEXT: Running pass: MergeICmpsPass
+; CHECK-O-NEXT: Running pass: ExpandMemCmpPass
 ; CHECK-O-NEXT: Running pass: InstSimplifyPass
 ; CHECK-O-NEXT: Running pass: DivRemPairsPass
 ; CHECK-O-NEXT: Running pass: TailCallElimPass
diff --git a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
index bf354c91d15f37..b5c5f0f537e774 100644
--- a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
@@ -77,12 +77,10 @@
 ; CHECK-O3-NEXT: Running pass: ArgumentPromotionPass
 ; CHECK-O2-NEXT: Running pass: OpenMPOptCGSCCPass
 ; CHECK-O3-NEXT: Running pass: OpenMPOptCGSCCPass
-; CHECK-O-NEXT: Running pass: MergeICmpsPass on foo
-; CHECK-O-NEXT: Running analysis: AAManager on foo
-; CHECK-O-NEXT: Running pass: ExpandMemCmpPass on foo
 ; CHECK-O-NEXT: Running pass: SROAPass
 ; CHECK-O-NEXT: Running pass: EarlyCSEPass
 ; CHECK-O-NEXT: Running analysis: MemorySSAAnalysis
+; CHECK-O-NEXT: Running analysis: AAManager
 ; CHECK-O23SZ-NEXT: Running pass: SpeculativeExecutionPass
 ; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass
 ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis
@@ -190,6 +188,8 @@
 ; CHECK-O-NEXT: Running pass: LICMPass
 ; CHECK-O-NEXT: Running pass: AlignmentFromAssumptionsPass
 ; CHECK-O-NEXT: Running pass: LoopSinkPass
+; CHECK-O-NEXT: Running pass: MergeICmpsPass
+; CHECK-O-NEXT: Running pass: ExpandMemCmpPass
 ; CHECK-O-NEXT: Running pass: InstSimplifyPass
 ; CHECK-O-NEXT: Running pass: DivRemPairsPass
 ; CHECK-O-NEXT: Running pass: TailCallElimPass
diff --git a/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll
index 9c5f9fd281ee7c..6486639e07b49c 100644
--- a/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll
@@ -112,12 +112,10 @@
 ; CHECK-O3-NEXT: Running pass: ArgumentPromotionPass
 ; CHECK-O2-NEXT: Running pass: OpenMPOptCGSCCPass on (foo)
 ; CHECK-O3-NEXT: Running pass: OpenMPOptCGSCCPass on (foo)
-; CHECK-O-NEXT: Running pass: MergeICmpsPass on foo
-; CHECK-O-NEXT: Running analysis: AAManager on foo
-; CHECK-O-NEXT: Running pass: ExpandMemCmpPass on foo
 ; CHECK-O-NEXT: Running pass: SROAPass
 ; CHECK-O-NEXT: Running pass: EarlyCSEPass
 ; CHECK-O-NEXT: Running analysis: MemorySSAAnalysis
+; CHECK-O-NEXT: Running analysis: AAManager
 ; CHECK-O23SZ-NEXT: Running pass: SpeculativeExecutionPass
 ; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass
 ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis
diff --git a/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll
index 92ab5b6bbc74ad..09f9f0f48baddb 100644
--- a/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll
@@ -102,23 +102,17 @@
 ; CHECK-O3-NEXT: Running pass: ArgumentPromotionPass
 ; CHECK-O2-NEXT: Running pass: OpenMPOptCGSCCPass
 ; CHECK-O3-NEXT: Running pass: OpenMPOptCGSCCPass
-; CHECK-O-NEXT: Running pass: MergeICmpsPass on foo
-; CHECK-O-NEXT: Running analysis: TargetIRAnalysis on foo
-; CHECK-O-NEXT: Running analysis: AAManager on foo
-; CHECK-O-NEXT: Running analysis: BasicAA on foo
-; CHECK-O-NEXT: Running analysis: AssumptionAnalysis on foo
-; CHECK-O-NEXT: Running analysis: DominatorTreeAnalysis on foo
-; CHECK-O-NEXT: Running analysis: ScopedNoAliasAA on foo
-; CHECK-O-NEXT: Running analysis: TypeBasedAA on foo
-; CHECK-O-NEXT: Running analysis: OuterAnalysisManagerProxy
-; CHECK-O-NEXT: Running pass: ExpandMemCmpPass on foo
-; CHECK-O-NEXT: Running analysis: BlockFrequencyAnalysis on foo
-; CHECK-O-NEXT: Running analysis: BranchProbabilityAnalysis on foo
-; CHECK-O-NEXT: Running analysis: LoopAnalysis on foo
-; CHECK-O-NEXT: Running analysis: PostDominatorTreeAnalysis on foo
 ; CHECK-O-NEXT: Running pass: SROAPass
+; CHECK-O-NEXT: Running analysis: DominatorTreeAnalysis
+; CHECK-O-NEXT: Running analysis: AssumptionAnalysis
+; CHECK-O-NEXT: Running analysis: TargetIRAnalysis
 ; CHECK-O-NEXT: Running pass: EarlyCSEPass
 ; CHECK-O-NEXT: Running analysis: MemorySSAAnalysis
+; CHECK-O-NEXT: Running analysis: AAManager
+; CHECK-O-NEXT: Running analysis: BasicAA
+; CHECK-O-NEXT: Running analysis: ScopedNoAliasAA
+; CHECK-O-NEXT: Running analysis: TypeBasedAA
+; CHECK-O-NEXT: Running analysis: OuterAnalysisManagerProxy
 ; CHECK-O23SZ-NEXT: Running pass: SpeculativeExecutionPass
 ; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass
 ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis
@@ -126,6 +120,10 @@
 ; CHECK-O23SZ-NEXT: Invalidating analysis: LazyValueAnalysis
 ; CHECK-O-NEXT: Running pass: SimplifyCFGPass
 ; CHECK-O-NEXT: Running pass: InstCombinePass
+; CHECK-O-NEXT: Running analysis: BlockFrequencyAnalysis on foo
+; CHECK-O-NEXT: Running analysis: BranchProbabilityAnalysis on foo
+; CHECK-O-NEXT: Running analysis: LoopAnalysis on foo
+; CHECK-O-NEXT: Running analysis: PostDominatorTreeAnalysis on foo
 ; CHECK-O23SZ-NEXT: Running pass: AggressiveInstCombinePass
 ; CHECK-O1-NEXT: Running pass: LibCallsShrinkWrapPass
 ; CHECK-O2-NEXT: Running pass: LibCallsShrinkWrapPass
diff --git a/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll
index b565e80ac05e90..47bdbfd2d357d4 100644
--- a/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll
@@ -81,12 +81,10 @@
 ; CHECK-O3-NEXT: Running pass: ArgumentPromotionPass
 ; CHECK-O2-NEXT: Running pass: OpenMPOptCGSCCPass
 ; CHECK-O3-NEXT: Running pass: OpenMPOptCGSCCPass
-; CHECK-O-NEXT: Running pass: MergeICmpsPass on foo
-; CHECK-O-NEXT: Running analysis: AAManager on foo
-; CHECK-O-NEXT: Running pass: ExpandMemCmpPass on foo
 ; CHECK-O-NEXT: Running pass: SROAPass
 ; CHECK-O-NEXT: Running pass: EarlyCSEPass
 ; CHECK-O-NEXT: Running analysis: MemorySSAAnalysis
+; CHECK-O-NEXT: Running analysis: AAManager
 ; CHECK-O23SZ-NEXT: Running pass: SpeculativeExecutionPass
 ; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass
 ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/memcmp-early.ll b/llvm/test/Transforms/PhaseOrdering/X86/memcmp-early.ll
index a62b17de08ee43..b4f7780444b25e 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/memcmp-early.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/memcmp-early.ll
@@ -1,6 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
 ; RUN: opt -O2 -S -mtriple=x86_64-unknown-unknown < %s | FileCheck %s
 
+
+; Examples of when moving memcmp expansion earlier in the pipeline is beneficial
+
 @s1 = internal global ptr @.str, align 8
 @s2 = internal global ptr @.str.1, align 8
 @s3 = internal global ptr @.str.2, align 8
@@ -19,19 +22,35 @@ define dso_local i32 @memcmp_same_prefix_consts(ptr noundef %x) #0 {
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[TMP2]], align 1
 ; CHECK-NEXT:    [[TMP4:%.*]] = zext i8 [[TMP3]] to i32
 ; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP4]], 52
-; CHECK-NEXT:    [[TMP6:%.*]] = or i32 [[TMP5]], [[TMP1]]
-; CHECK-NEXT:    [[DOTNOT:%.*]] = icmp eq i32 [[TMP6]], 0
-; CHECK-NEXT:    br i1 [[DOTNOT]], label [[IF_END8:%.*]], label [[IF_THEN:%.*]]
+; CHECK-NEXT:    [[TMP6:%.*]] = or i32 [[TMP1]], [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp ne i32 [[TMP6]], 0
+; CHECK-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
+; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[TMP8]], 0
+; CHECK-NEXT:    br i1 [[CMP_NOT]], label [[IF_END8:%.*]], label [[IF_THEN:%.*]]
 ; CHECK:       if.then:
-; CHECK-NEXT:    [[TMP7:%.*]] = xor i32 [[TMP4]], 33
-; CHECK-NEXT:    [[TMP8:%.*]] = or i32 [[TMP7]], [[TMP1]]
-; CHECK-NEXT:    [[DOTNOT3:%.*]] = icmp eq i32 [[TMP8]], 0
-; CHECK-NEXT:    br i1 [[DOTNOT3]], label [[IF_END8]], label [[IF_THEN3:%.*]]
+; CHECK-NEXT:    [[TMP9:%.*]] = load i32, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP9]], 858927408
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; CHECK-NEXT:    [[TMP12:%.*]] = load i8, ptr [[TMP11]], align 1
+; CHECK-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP12]] to i32
+; CHECK-NEXT:    [[TMP14:%.*]] = xor i32 [[TMP13]], 33
+; CHECK-NEXT:    [[TMP15:%.*]] = or i32 [[TMP10]], [[TMP14]]
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp ne i32 [[TMP15]], 0
+; CHECK-NEXT:    [[TMP17:%.*]] = zext i1 [[TMP16]] to i32
+; CHECK-NEXT:    [[CMP2_NOT:%.*]] = icmp eq i32 [[TMP17]], 0
+; CHECK-NEXT:    br i1 [[CMP2_NOT]], label [[IF_END8]], label [[IF_THEN3:%.*]]
 ; CHECK:       if.then3:
-; CHECK-NEXT:    [[TMP9:%.*]] = xor i32 [[TMP4]], 63
-; CHECK-NEXT:    [[TMP10:%.*]] = or i32 [[TMP9]], [[TMP1]]
-; CHECK-NEXT:    [[DOTNOT4:%.*]] = icmp eq i32 [[TMP10]], 0
-; CHECK-NEXT:    br i1 [[DOTNOT4]], label [[IF_END8]], label [[RETURN:%.*]]
+; CHECK-NEXT:    [[TMP18:%.*]] = load i32, ptr [[X]], align 1
+; CHECK-NEXT:    [[TMP19:%.*]] = xor i32 [[TMP18]], 858927408
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; CHECK-NEXT:    [[TMP21:%.*]] = load i8, ptr [[TMP20]], align 1
+; CHECK-NEXT:    [[TMP22:%.*]] = zext i8 [[TMP21]] to i32
+; CHECK-NEXT:    [[TMP23:%.*]] = xor i32 [[TMP22]], 63
+; CHECK-NEXT:    [[TMP24:%.*]] = or i32 [[TMP19]], [[TMP23]]
+; CHECK-NEXT:    [[TMP25:%.*]] = icmp ne i32 [[TMP24]], 0
+; CHECK-NEXT:    [[TMP26:%.*]] = zext i1 [[TMP25]] to i32
+; CHECK-NEXT:    [[CMP5_NOT:%.*]] = icmp eq i32 [[TMP26]], 0
+; CHECK-NEXT:    br i1 [[CMP5_NOT]], label [[IF_END8]], label [[RETURN:%.*]]
 ; CHECK:       if.end8:
 ; CHECK-NEXT:    br label [[RETURN]]
 ; CHECK:       return:
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/memcmp.ll b/llvm/test/Transforms/PhaseOrdering/X86/memcmp.ll
index 68dfacac5b5e12..de90aec1a49c78 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/memcmp.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/memcmp.ll
@@ -42,14 +42,14 @@ define i1 @length0_lt(ptr %X, ptr %Y) #0 {
 ; Function Attrs: nounwind
 define i32 @length2(ptr %X, ptr %Y) #0 {
 ; CHECK-LABEL: define i32 @length2(
-; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] {
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = tail call i16 @llvm.bswap.i16(i16 [[TMP1]])
 ; CHECK-NEXT:    [[TMP4:%.*]] = tail call i16 @llvm.bswap.i16(i16 [[TMP2]])
 ; CHECK-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
 ; CHECK-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
-; CHECK-NEXT:    [[TMP7:%.*]] = sub nsw i32 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
 ; CHECK-NEXT:    ret i32 [[TMP7]]
 ;
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) #0
@@ -59,11 +59,13 @@ define i32 @length2(ptr %X, ptr %Y) #0 {
 ; Function Attrs: nounwind
 define i1 @length2_eq(ptr %X, ptr %Y) #0 {
 ; CHECK-LABEL: define i1 @length2_eq(
-; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
-; CHECK-NEXT:    [[DOTNOT:%.*]] = icmp eq i16 [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    ret i1 [[DOTNOT]]
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT:    ret i1 [[C]]
 ;
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) #0
   %c = icmp eq i32 %m, 0
@@ -73,12 +75,15 @@ define i1 @length2_eq(ptr %X, ptr %Y) #0 {
 ; Function Attrs: nounwind
 define i1 @length2_lt(ptr %X, ptr %Y) #0 {
 ; CHECK-LABEL: define i1 @length2_lt(
-; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = tail call i16 @llvm.bswap.i16(i16 [[TMP1]])
 ; CHECK-NEXT:    [[TMP4:%.*]] = tail call i16 @llvm.bswap.i16(i16 [[TMP2]])
-; CHECK-NEXT:    [[C:%.*]] = icmp ult i16 [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; CHECK-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; CHECK-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[C:%.*]] = icmp slt i32 [[TMP7]], 0
 ; CHECK-NEXT:    ret i1 [[C]]
 ;
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) #0
@@ -89,12 +94,15 @@ define i1 @length2_lt(ptr %X, ptr %Y) #0 {
 ; Function Attrs: nounwind
 define i1 @length2_gt(ptr %X, ptr %Y) #0 {
 ; CHECK-LABEL: define i1 @length2_gt(
-; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = tail call i16 @llvm.bswap.i16(i16 [[TMP1]])
 ; CHECK-NEXT:    [[TMP4:%.*]] = tail call i16 @llvm.bswap.i16(i16 [[TMP2]])
-; CHECK-NEXT:    [[C:%.*]] = icmp ugt i16 [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP3]] to i32
+; CHECK-NEXT:    [[TMP6:%.*]] = zext i16 [[TMP4]] to i32
+; CHECK-NEXT:    [[TMP7:%.*]] = sub i32 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[C:%.*]] = icmp sgt i32 [[TMP7]], 0
 ; CHECK-NEXT:    ret i1 [[C]]
 ;
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 2) #0
@@ -105,7 +113,7 @@ define i1 @length2_gt(ptr %X, ptr %Y) #0 {
 ; Function Attrs: nounwind
 define i1 @length2_eq_const(ptr %X) #0 {
 ; CHECK-LABEL: define i1 @length2_eq_const(
-; CHECK-SAME: ptr nocapture readonly [[X:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = icmp ne i16 [[TMP1]], 12849
 ; CHECK-NEXT:    ret i1 [[TMP2]]
@@ -118,7 +126,7 @@ define i1 @length2_eq_const(ptr %X) #0 {
 ; Function Attrs: nounwind
 define i1 @length2_eq_nobuiltin_attr(ptr %X, ptr %Y) #0 {
 ; CHECK-LABEL: define i1 @length2_eq_nobuiltin_attr(
-; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; CHECK-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 2) #[[ATTR4:[0-9]+]]
 ; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
 ; CHECK-NEXT:    ret i1 [[C]]
@@ -131,16 +139,16 @@ define i1 @length2_eq_nobuiltin_attr(ptr %X, ptr %Y) #0 {
 ; Function Attrs: nounwind
 define i32 @length3(ptr %X, ptr %Y) #0 {
 ; CHECK-LABEL: define i32 @length3(
-; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; CHECK-NEXT:  loadbb:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[X]], align 1
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[Y]], align 1
-; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i16 [[TMP0]], [[TMP1]]
-; CHECK-NEXT:    br i1 [[TMP2]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
-; CHECK:       res_block:
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call i16 @llvm.bswap.i16(i16 [[TMP0]])
 ; CHECK-NEXT:    [[TMP3:%.*]] = tail call i16 @llvm.bswap.i16(i16 [[TMP1]])
-; CHECK-NEXT:    [[TMP4:%.*]] = tail call i16 @llvm.bswap.i16(i16 [[TMP0]])
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp ult i16 [[TMP4]], [[TMP3]]
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i16 [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    br i1 [[TMP4]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ult i16 [[TMP2]], [[TMP3]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], i32 -1, i32 1
 ; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
 ; CHECK:       loadbb1:
@@ -150,7 +158,7 @@ define i32 @length3(ptr %X, ptr %Y) #0 {
 ; CHECK-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
 ; CHECK-NEXT:    [[TMP11:%.*]] = zext i8 [[TMP9]] to i32
 ; CHECK-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
-; CHECK-NEXT:    [[TMP13:%.*]] = sub nsw i32 [[TMP11]], [[TMP12]]
+; CHECK-NEXT:    [[TMP13:%.*]] = sub i32 [[TMP11]], [[TMP12]]
 ; CHECK-NEXT:    br label [[ENDBLOCK]]
 ; CHECK:       endblock:
 ; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP13]], [[LOADBB1]] ], [ [[TMP6]], [[RES_BLOCK]] ]
@@ -163,19 +171,20 @@ define i32 @length3(ptr %X, ptr %Y) #0 {
 ; Function Attrs: nounwind
 define i1 @length3_eq(ptr %X, ptr %Y) #0 {
 ; CHECK-LABEL: define i1 @length3_eq(
-; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
-; CHECK-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP2]], [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP1]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 2
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 2
 ; CHECK-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
 ; CHECK-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
-; CHECK-NEXT:    [[TMP8:%.*]] = xor i8 [[TMP7]], [[TMP6]]
-; CHECK-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP8]] to i16
-; CHECK-NEXT:    [[TMP10:%.*]] = or i16 [[TMP3]], [[TMP9]]
-; CHECK-NEXT:    [[TMP11:%.*]] = icmp ne i16 [[TMP10]], 0
-; CHECK-NEXT:    ret i1 [[TMP11]]
+; CHECK-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i16
+; CHECK-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i16
+; CHECK-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; CHECK-NEXT:    ret i1 [[TMP12]]
 ;
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 3) #0
   %c = icmp ne i32 %m, 0
@@ -185,7 +194,7 @@ define i1 @length3_eq(ptr %X, ptr %Y) #0 {
 ; Function Attrs: nounwind
 define i32 @length4(ptr %X, ptr %Y) #0 {
 ; CHECK-LABEL: define i32 @length4(
-; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = tail call i32 @llvm.bswap.i32(i32 [[TMP1]])
@@ -193,9 +202,9 @@ define i32 @length4(ptr %X, ptr %Y) #0 {
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp ugt i32 [[TMP3]], [[TMP4]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP3]], [[TMP4]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
-; CHECK-NEXT:    [[DOTNEG:%.*]] = sext i1 [[TMP6]] to i32
-; CHECK-NEXT:    [[TMP8:%.*]] = add nsw i32 [[DOTNEG]], [[TMP7]]
-; CHECK-NEXT:    ret i32 [[TMP8]]
+; CHECK-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; CHECK-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    ret i32 [[TMP9]]
 ;
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) #0
   ret i32 %m
@@ -204,7 +213,7 @@ define i32 @length4(ptr %X, ptr %Y) #0 {
 ; Function Attrs: nounwind
 define i1 @length4_eq(ptr %X, ptr %Y) #0 {
 ; CHECK-LABEL: define i1 @length4_eq(
-; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
@@ -218,7 +227,7 @@ define i1 @length4_eq(ptr %X, ptr %Y) #0 {
 ; Function Attrs: nounwind
 define i1 @length4_lt(ptr %X, ptr %Y) #0 {
 ; CHECK-LABEL: define i1 @length4_lt(
-; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = tail call i32 @llvm.bswap.i32(i32 [[TMP1]])
@@ -234,7 +243,7 @@ define i1 @length4_lt(ptr %X, ptr %Y) #0 {
 ; Function Attrs: nounwind
 define i1 @length4_gt(ptr %X, ptr %Y) #0 {
 ; CHECK-LABEL: define i1 @length4_gt(
-; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = tail call i32 @llvm.bswap.i32(i32 [[TMP1]])
@@ -250,10 +259,12 @@ define i1 @length4_gt(ptr %X, ptr %Y) #0 {
 ; Function Attrs: nounwind
 define i1 @length4_eq_const(ptr %X) #0 {
 ; CHECK-LABEL: define i1 @length4_eq_const(
-; CHECK-SAME: ptr nocapture readonly [[X:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
-; CHECK-NEXT:    [[DOTNOT:%.*]] = icmp eq i32 [[TMP1]], 875770417
-; CHECK-NEXT:    ret i1 [[DOTNOT]]
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ne i32 [[TMP1]], 875770417
+; CHECK-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; CHECK-NEXT:    ret i1 [[C]]
 ;
   %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([65 x i8], ptr @.str, i32 0, i32 1), i64 4) #0
   %c = icmp eq i32 %m, 0
@@ -263,16 +274,16 @@ define i1 @length4_eq_const(ptr %X) #0 {
 ; Function Attrs: nounwind
 define i32 @length5(ptr %X, ptr %Y) #0 {
 ; CHECK-LABEL: define i32 @length5(
-; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; CHECK-NEXT:  loadbb:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[X]], align 1
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[Y]], align 1
-; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[TMP0]], [[TMP1]]
-; CHECK-NEXT:    br i1 [[TMP2]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
-; CHECK:       res_block:
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call i32 @llvm.bswap.i32(i32 [[TMP0]])
 ; CHECK-NEXT:    [[TMP3:%.*]] = tail call i32 @llvm.bswap.i32(i32 [[TMP1]])
-; CHECK-NEXT:    [[TMP4:%.*]] = tail call i32 @llvm.bswap.i32(i32 [[TMP0]])
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP4]], [[TMP3]]
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    br i1 [[TMP4]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP2]], [[TMP3]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], i32 -1, i32 1
 ; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
 ; CHECK:       loadbb1:
@@ -282,7 +293,7 @@ define i32 @length5(ptr %X, ptr %Y) #0 {
 ; CHECK-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
 ; CHECK-NEXT:    [[TMP11:%.*]] = zext i8 [[TMP9]] to i32
 ; CHECK-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
-; CHECK-NEXT:    [[TMP13:%.*]] = sub nsw i32 [[TMP11]], [[TMP12]]
+; CHECK-NEXT:    [[TMP13:%.*]] = sub i32 [[TMP11]], [[TMP12]]
 ; CHECK-NEXT:    br label [[ENDBLOCK]]
 ; CHECK:       endblock:
 ; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP13]], [[LOADBB1]] ], [ [[TMP6]], [[RES_BLOCK]] ]
@@ -295,19 +306,20 @@ define i32 @length5(ptr %X, ptr %Y) #0 {
 ; Function Attrs: nounwind
 define i1 @length5_eq(ptr %X, ptr %Y) #0 {
 ; CHECK-LABEL: define i1 @length5_eq(
-; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
-; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP2]], [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 4
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 4
 ; CHECK-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
 ; CHECK-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
-; CHECK-NEXT:    [[TMP8:%.*]] = xor i8 [[TMP7]], [[TMP6]]
-; CHECK-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP8]] to i32
-; CHECK-NEXT:    [[TMP10:%.*]] = or i32 [[TMP3]], [[TMP9]]
-; CHECK-NEXT:    [[TMP11:%.*]] = icmp ne i32 [[TMP10]], 0
-; CHECK-NEXT:    ret i1 [[TMP11]]
+; CHECK-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i32
+; CHECK-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i32
+; CHECK-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; CHECK-NEXT:    ret i1 [[TMP12]]
 ;
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) #0
   %c = icmp ne i32 %m, 0
@@ -317,27 +329,31 @@ define i1 @length5_eq(ptr %X, ptr %Y) #0 {
 ; Function Attrs: nounwind
 define i1 @length5_lt(ptr %X, ptr %Y) #0 {
 ; CHECK-LABEL: define i1 @length5_lt(
-; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; CHECK-NEXT:  loadbb:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[X]], align 1
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[Y]], align 1
-; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[TMP0]], [[TMP1]]
-; CHECK-NEXT:    br i1 [[TMP2]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
-; CHECK:       res_block:
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call i32 @llvm.bswap.i32(i32 [[TMP0]])
 ; CHECK-NEXT:    [[TMP3:%.*]] = tail call i32 @llvm.bswap.i32(i32 [[TMP1]])
-; CHECK-NEXT:    [[TMP4:%.*]] = tail call i32 @llvm.bswap.i32(i32 [[TMP0]])
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP4]], [[TMP3]]
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    br i1 [[TMP4]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
+; CHECK:       res_block:
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], i32 -1, i32 1
 ; CHECK-NEXT:    br label [[ENDBLOCK:%.*]]
 ; CHECK:       loadbb1:
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[X]], i64 4
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[Y]], i64 4
-; CHECK-NEXT:    [[TMP8:%.*]] = load i8, ptr [[TMP6]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[X]], i64 4
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[Y]], i64 4
 ; CHECK-NEXT:    [[TMP9:%.*]] = load i8, ptr [[TMP7]], align 1
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp ult i8 [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP11:%.*]] = zext i8 [[TMP9]] to i32
+; CHECK-NEXT:    [[TMP12:%.*]] = zext i8 [[TMP10]] to i32
+; CHECK-NEXT:    [[TMP13:%.*]] = sub i32 [[TMP11]], [[TMP12]]
 ; CHECK-NEXT:    br label [[ENDBLOCK]]
 ; CHECK:       endblock:
-; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i1 [ [[TMP10]], [[LOADBB1]] ], [ [[TMP5]], [[RES_BLOCK]] ]
-; CHECK-NEXT:    ret i1 [[PHI_RES]]
+; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ [[TMP13]], [[LOADBB1]] ], [ [[TMP6]], [[RES_BLOCK]] ]
+; CHECK-NEXT:    [[C:%.*]] = icmp slt i32 [[PHI_RES]], 0
+; CHECK-NEXT:    ret i1 [[C]]
 ;
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) #0
   %c = icmp slt i32 %m, 0
@@ -347,17 +363,18 @@ define i1 @length5_lt(ptr %X, ptr %Y) #0 {
 ; Function Attrs: nounwind
 define i1 @length7_eq(ptr %X, ptr %Y) #0 {
 ; CHECK-LABEL: define i1 @length7_eq(
-; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 3
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[Y]], i64 3
-; CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr [[TMP3]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
 ; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne i32 [[TMP5]], [[TMP6]]
-; CHECK-NEXT:    [[TMP9:%.*]] = or i1 [[TMP7]], [[TMP8]]
-; CHECK-NEXT:    ret i1 [[TMP9]]
+; CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; CHECK-NEXT:    ret i1 [[TMP10]]
 ;
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 7) #0
   %c = icmp ne i32 %m, 0
@@ -367,7 +384,7 @@ define i1 @length7_eq(ptr %X, ptr %Y) #0 {
 ; Function Attrs: nounwind
 define i32 @length8(ptr %X, ptr %Y) #0 {
 ; CHECK-LABEL: define i32 @length8(
-; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = tail call i64 @llvm.bswap.i64(i64 [[TMP1]])
@@ -375,9 +392,9 @@ define i32 @length8(ptr %X, ptr %Y) #0 {
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp ugt i64 [[TMP3]], [[TMP4]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp ult i64 [[TMP3]], [[TMP4]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = zext i1 [[TMP5]] to i32
-; CHECK-NEXT:    [[DOTNEG:%.*]] = sext i1 [[TMP6]] to i32
-; CHECK-NEXT:    [[TMP8:%.*]] = add nsw i32 [[DOTNEG]], [[TMP7]]
-; CHECK-NEXT:    ret i32 [[TMP8]]
+; CHECK-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP6]] to i32
+; CHECK-NEXT:    [[TMP9:%.*]] = sub i32 [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    ret i32 [[TMP9]]
 ;
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 8) #0
   ret i32 %m
@@ -386,11 +403,13 @@ define i32 @length8(ptr %X, ptr %Y) #0 {
 ; Function Attrs: nounwind
 define i1 @length8_eq(ptr %X, ptr %Y) #0 {
 ; CHECK-LABEL: define i1 @length8_eq(
-; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
-; CHECK-NEXT:    [[DOTNOT:%.*]] = icmp eq i64 [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    ret i1 [[DOTNOT]]
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT:    ret i1 [[C]]
 ;
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 8) #0
   %c = icmp eq i32 %m, 0
@@ -400,7 +419,7 @@ define i1 @length8_eq(ptr %X, ptr %Y) #0 {
 ; Function Attrs: nounwind
 define i1 @length8_eq_const(ptr %X) #0 {
 ; CHECK-LABEL: define i1 @length8_eq_const(
-; CHECK-SAME: ptr nocapture readonly [[X:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = icmp ne i64 [[TMP1]], 3978425819141910832
 ; CHECK-NEXT:    ret i1 [[TMP2]]
@@ -413,19 +432,22 @@ define i1 @length8_eq_const(ptr %X) #0 {
 ; Function Attrs: nounwind
 define i1 @length9_eq(ptr %X, ptr %Y) #0 {
 ; CHECK-LABEL: define i1 @length9_eq(
-; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
-; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP2]], [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
 ; CHECK-NEXT:    [[TMP6:%.*]] = load i8, ptr [[TMP4]], align 1
 ; CHECK-NEXT:    [[TMP7:%.*]] = load i8, ptr [[TMP5]], align 1
-; CHECK-NEXT:    [[TMP8:%.*]] = xor i8 [[TMP7]], [[TMP6]]
-; CHECK-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP8]] to i64
-; CHECK-NEXT:    [[TMP10:%.*]] = or i64 [[TMP3]], [[TMP9]]
-; CHECK-NEXT:    [[DOTNOT:%.*]] = icmp eq i64 [[TMP10]], 0
-; CHECK-NEXT:    ret i1 [[DOTNOT]]
+; CHECK-NEXT:    [[TMP8:%.*]] = zext i8 [[TMP6]] to i64
+; CHECK-NEXT:    [[TMP9:%.*]] = zext i8 [[TMP7]] to i64
+; CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; CHECK-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; CHECK-NEXT:    ret i1 [[C]]
 ;
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 9) #0
   %c = icmp eq i32 %m, 0
@@ -435,19 +457,22 @@ define i1 @length9_eq(ptr %X, ptr %Y) #0 {
 ; Function Attrs: nounwind
 define i1 @length10_eq(ptr %X, ptr %Y) #0 {
 ; CHECK-LABEL: define i1 @length10_eq(
-; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
-; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP2]], [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
 ; CHECK-NEXT:    [[TMP6:%.*]] = load i16, ptr [[TMP4]], align 1
 ; CHECK-NEXT:    [[TMP7:%.*]] = load i16, ptr [[TMP5]], align 1
-; CHECK-NEXT:    [[TMP8:%.*]] = xor i16 [[TMP7]], [[TMP6]]
-; CHECK-NEXT:    [[TMP9:%.*]] = zext i16 [[TMP8]] to i64
-; CHECK-NEXT:    [[TMP10:%.*]] = or i64 [[TMP3]], [[TMP9]]
-; CHECK-NEXT:    [[DOTNOT:%.*]] = icmp eq i64 [[TMP10]], 0
-; CHECK-NEXT:    ret i1 [[DOTNOT]]
+; CHECK-NEXT:    [[TMP8:%.*]] = zext i16 [[TMP6]] to i64
+; CHECK-NEXT:    [[TMP9:%.*]] = zext i16 [[TMP7]] to i64
+; CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; CHECK-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP13]], 0
+; CHECK-NEXT:    ret i1 [[C]]
 ;
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 10) #0
   %c = icmp eq i32 %m, 0
@@ -457,17 +482,20 @@ define i1 @length10_eq(ptr %X, ptr %Y) #0 {
 ; Function Attrs: nounwind
 define i1 @length11_eq(ptr %X, ptr %Y) #0 {
 ; CHECK-LABEL: define i1 @length11_eq(
-; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 3
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[Y]], i64 3
-; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr [[TMP3]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 3
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 3
 ; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
-; CHECK-NEXT:    [[DOTNOT2:%.*]] = and i1 [[TMP7]], [[TMP8]]
-; CHECK-NEXT:    ret i1 [[DOTNOT2]]
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; CHECK-NEXT:    ret i1 [[C]]
 ;
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 11) #0
   %c = icmp eq i32 %m, 0
@@ -477,19 +505,20 @@ define i1 @length11_eq(ptr %X, ptr %Y) #0 {
 ; Function Attrs: nounwind
 define i1 @length12_eq(ptr %X, ptr %Y) #0 {
 ; CHECK-LABEL: define i1 @length12_eq(
-; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
-; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP2]], [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 8
 ; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr [[TMP4]], align 1
 ; CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 1
-; CHECK-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP7]], [[TMP6]]
-; CHECK-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP8]] to i64
-; CHECK-NEXT:    [[TMP10:%.*]] = or i64 [[TMP3]], [[TMP9]]
-; CHECK-NEXT:    [[TMP11:%.*]] = icmp ne i64 [[TMP10]], 0
-; CHECK-NEXT:    ret i1 [[TMP11]]
+; CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
+; CHECK-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP7]] to i64
+; CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; CHECK-NEXT:    ret i1 [[TMP12]]
 ;
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 12) #0
   %c = icmp ne i32 %m, 0
@@ -499,13 +528,13 @@ define i1 @length12_eq(ptr %X, ptr %Y) #0 {
 ; Function Attrs: nounwind
 define i32 @length12(ptr %X, ptr %Y) #0 {
 ; CHECK-LABEL: define i32 @length12(
-; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; CHECK-NEXT:  loadbb:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[X]], align 1
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[Y]], align 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = tail call i64 @llvm.bswap.i64(i64 [[TMP0]])
 ; CHECK-NEXT:    [[TMP3:%.*]] = tail call i64 @llvm.bswap.i64(i64 [[TMP1]])
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[TMP2]], [[TMP3]]
 ; CHECK-NEXT:    br i1 [[TMP4]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
 ; CHECK:       res_block:
 ; CHECK-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP2]], [[LOADBB:%.*]] ], [ [[TMP13:%.*]], [[LOADBB1]] ]
@@ -522,7 +551,7 @@ define i32 @length12(ptr %X, ptr %Y) #0 {
 ; CHECK-NEXT:    [[TMP12:%.*]] = tail call i32 @llvm.bswap.i32(i32 [[TMP10]])
 ; CHECK-NEXT:    [[TMP13]] = zext i32 [[TMP11]] to i64
 ; CHECK-NEXT:    [[TMP14]] = zext i32 [[TMP12]] to i64
-; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i32 [[TMP9]], [[TMP10]]
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[TMP13]], [[TMP14]]
 ; CHECK-NEXT:    br i1 [[TMP15]], label [[ENDBLOCK]], label [[RES_BLOCK]]
 ; CHECK:       endblock:
 ; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP6]], [[RES_BLOCK]] ]
@@ -535,17 +564,20 @@ define i32 @length12(ptr %X, ptr %Y) #0 {
 ; Function Attrs: nounwind
 define i1 @length13_eq(ptr %X, ptr %Y) #0 {
 ; CHECK-LABEL: define i1 @length13_eq(
-; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 5
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[Y]], i64 5
-; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr [[TMP3]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 5
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 5
 ; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
-; CHECK-NEXT:    [[DOTNOT2:%.*]] = and i1 [[TMP7]], [[TMP8]]
-; CHECK-NEXT:    ret i1 [[DOTNOT2]]
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; CHECK-NEXT:    ret i1 [[C]]
 ;
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 13) #0
   %c = icmp eq i32 %m, 0
@@ -555,17 +587,20 @@ define i1 @length13_eq(ptr %X, ptr %Y) #0 {
 ; Function Attrs: nounwind
 define i1 @length14_eq(ptr %X, ptr %Y) #0 {
 ; CHECK-LABEL: define i1 @length14_eq(
-; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 6
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[Y]], i64 6
-; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr [[TMP3]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 6
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 6
 ; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
-; CHECK-NEXT:    [[DOTNOT2:%.*]] = and i1 [[TMP7]], [[TMP8]]
-; CHECK-NEXT:    ret i1 [[DOTNOT2]]
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; CHECK-NEXT:    ret i1 [[C]]
 ;
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 14) #0
   %c = icmp eq i32 %m, 0
@@ -575,17 +610,20 @@ define i1 @length14_eq(ptr %X, ptr %Y) #0 {
 ; Function Attrs: nounwind
 define i1 @length15_eq(ptr %X, ptr %Y) #0 {
 ; CHECK-LABEL: define i1 @length15_eq(
-; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[Y]], align 1
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 7
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[Y]], i64 7
-; CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr [[TMP3]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i64 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 7
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 7
 ; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[TMP5]], [[TMP6]]
-; CHECK-NEXT:    [[DOTNOT2:%.*]] = and i1 [[TMP7]], [[TMP8]]
-; CHECK-NEXT:    ret i1 [[DOTNOT2]]
+; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = or i64 [[TMP3]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne i64 [[TMP9]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP11]], 0
+; CHECK-NEXT:    ret i1 [[C]]
 ;
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 15) #0
   %c = icmp eq i32 %m, 0
@@ -595,13 +633,13 @@ define i1 @length15_eq(ptr %X, ptr %Y) #0 {
 ; Function Attrs: nounwind
 define i32 @length16(ptr %X, ptr %Y) #0 {
 ; CHECK-LABEL: define i32 @length16(
-; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; CHECK-NEXT:  loadbb:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[X]], align 1
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[Y]], align 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = tail call i64 @llvm.bswap.i64(i64 [[TMP0]])
 ; CHECK-NEXT:    [[TMP3:%.*]] = tail call i64 @llvm.bswap.i64(i64 [[TMP1]])
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[TMP2]], [[TMP3]]
 ; CHECK-NEXT:    br i1 [[TMP4]], label [[LOADBB1:%.*]], label [[RES_BLOCK:%.*]]
 ; CHECK:       res_block:
 ; CHECK-NEXT:    [[PHI_SRC1:%.*]] = phi i64 [ [[TMP2]], [[LOADBB:%.*]] ], [ [[TMP11:%.*]], [[LOADBB1]] ]
@@ -616,7 +654,7 @@ define i32 @length16(ptr %X, ptr %Y) #0 {
 ; CHECK-NEXT:    [[TMP10:%.*]] = load i64, ptr [[TMP8]], align 1
 ; CHECK-NEXT:    [[TMP11]] = tail call i64 @llvm.bswap.i64(i64 [[TMP9]])
 ; CHECK-NEXT:    [[TMP12]] = tail call i64 @llvm.bswap.i64(i64 [[TMP10]])
-; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[TMP9]], [[TMP10]]
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[TMP11]], [[TMP12]]
 ; CHECK-NEXT:    br i1 [[TMP13]], label [[ENDBLOCK]], label [[RES_BLOCK]]
 ; CHECK:       endblock:
 ; CHECK-NEXT:    [[PHI_RES:%.*]] = phi i32 [ 0, [[LOADBB1]] ], [ [[TMP6]], [[RES_BLOCK]] ]
@@ -629,7 +667,7 @@ define i32 @length16(ptr %X, ptr %Y) #0 {
 ; Function Attrs: nounwind
 define i1 @length16_eq(ptr %x, ptr %y) #0 {
 ; CHECK-LABEL: define i1 @length16_eq(
-; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne i128 [[TMP1]], [[TMP2]]
@@ -643,10 +681,12 @@ define i1 @length16_eq(ptr %x, ptr %y) #0 {
 ; Function Attrs: nounwind
 define i1 @length16_eq_const(ptr %X) #0 {
 ; CHECK-LABEL: define i1 @length16_eq_const(
-; CHECK-SAME: ptr nocapture readonly [[X:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
-; CHECK-NEXT:    [[DOTNOT:%.*]] = icmp eq i128 [[TMP1]], 70720121592765328381466889075544961328
-; CHECK-NEXT:    ret i1 [[DOTNOT]]
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ne i128 [[TMP1]], 70720121592765328381466889075544961328
+; CHECK-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP3]], 0
+; CHECK-NEXT:    ret i1 [[C]]
 ;
   %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 16) #0
   %c = icmp eq i32 %m, 0
@@ -657,7 +697,7 @@ define i1 @length16_eq_const(ptr %X) #0 {
 define i32 @length24(ptr %X, ptr %Y) #0 {
 ; CHECK-LABEL: define i32 @length24(
 ; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; CHECK-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr noundef nonnull dereferenceable(24) [[X]], ptr noundef nonnull dereferenceable(24) [[Y]], i64 24) #[[ATTR5:[0-9]+]]
+; CHECK-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr noundef nonnull dereferenceable(24) [[X]], ptr noundef nonnull dereferenceable(24) [[Y]], i64 24) #[[ATTR2:[0-9]+]]
 ; CHECK-NEXT:    ret i32 [[M]]
 ;
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 24) #0
@@ -667,19 +707,22 @@ define i32 @length24(ptr %X, ptr %Y) #0 {
 ; Function Attrs: nounwind
 define i1 @length24_eq(ptr %x, ptr %y) #0 {
 ; CHECK-LABEL: define i1 @length24_eq(
-; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
-; CHECK-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP2]], [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
 ; CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 1
 ; CHECK-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 1
-; CHECK-NEXT:    [[TMP8:%.*]] = xor i64 [[TMP7]], [[TMP6]]
-; CHECK-NEXT:    [[TMP9:%.*]] = zext i64 [[TMP8]] to i128
-; CHECK-NEXT:    [[TMP10:%.*]] = or i128 [[TMP3]], [[TMP9]]
-; CHECK-NEXT:    [[DOTNOT:%.*]] = icmp eq i128 [[TMP10]], 0
-; CHECK-NEXT:    ret i1 [[DOTNOT]]
+; CHECK-NEXT:    [[TMP8:%.*]] = zext i64 [[TMP6]] to i128
+; CHECK-NEXT:    [[TMP9:%.*]] = zext i64 [[TMP7]] to i128
+; CHECK-NEXT:    [[TMP10:%.*]] = xor i128 [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = or i128 [[TMP3]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne i128 [[TMP11]], 0
+; CHECK-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP13]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
 ;
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 24) #0
   %cmp = icmp eq i32 %call, 0
@@ -689,13 +732,13 @@ define i1 @length24_eq(ptr %x, ptr %y) #0 {
 ; Function Attrs: nounwind
 define i1 @length24_eq_const(ptr %X) #0 {
 ; CHECK-LABEL: define i1 @length24_eq_const(
-; CHECK-SAME: ptr nocapture readonly [[X:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 1
-; CHECK-NEXT:    [[TMP5:%.*]] = xor i64 [[TMP4]], 3689065127958034230
-; CHECK-NEXT:    [[TMP6:%.*]] = zext i64 [[TMP5]] to i128
+; CHECK-NEXT:    [[TMP5:%.*]] = zext i64 [[TMP4]] to i128
+; CHECK-NEXT:    [[TMP6:%.*]] = xor i128 [[TMP5]], 3689065127958034230
 ; CHECK-NEXT:    [[TMP7:%.*]] = or i128 [[TMP2]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne i128 [[TMP7]], 0
 ; CHECK-NEXT:    ret i1 [[TMP8]]
@@ -709,7 +752,7 @@ define i1 @length24_eq_const(ptr %X) #0 {
 define i32 @length32(ptr %X, ptr %Y) #0 {
 ; CHECK-LABEL: define i32 @length32(
 ; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; CHECK-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr noundef nonnull dereferenceable(32) [[X]], ptr noundef nonnull dereferenceable(32) [[Y]], i64 32) #[[ATTR5]]
+; CHECK-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr noundef nonnull dereferenceable(32) [[X]], ptr noundef nonnull dereferenceable(32) [[Y]], i64 32) #[[ATTR2]]
 ; CHECK-NEXT:    ret i32 [[M]]
 ;
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 32) #0
@@ -719,17 +762,20 @@ define i32 @length32(ptr %X, ptr %Y) #0 {
 ; Function Attrs: nounwind
 define i1 @length32_eq(ptr %x, ptr %y) #0 {
 ; CHECK-LABEL: define i1 @length32_eq(
-; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[Y]], i64 16
-; CHECK-NEXT:    [[TMP5:%.*]] = load i128, ptr [[TMP3]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i128 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[Y]], i64 16
 ; CHECK-NEXT:    [[TMP6:%.*]] = load i128, ptr [[TMP4]], align 1
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i128 [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i128 [[TMP5]], [[TMP6]]
-; CHECK-NEXT:    [[DOTNOT2:%.*]] = and i1 [[TMP7]], [[TMP8]]
-; CHECK-NEXT:    ret i1 [[DOTNOT2]]
+; CHECK-NEXT:    [[TMP7:%.*]] = load i128, ptr [[TMP5]], align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = xor i128 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = or i128 [[TMP3]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP11]], 0
+; CHECK-NEXT:    ret i1 [[CMP]]
 ;
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 32) #0
   %cmp = icmp eq i32 %call, 0
@@ -739,14 +785,15 @@ define i1 @length32_eq(ptr %x, ptr %y) #0 {
 ; Function Attrs: nounwind
 define i1 @length32_eq_const(ptr %X) #0 {
 ; CHECK-LABEL: define i1 @length32_eq_const(
-; CHECK-SAME: ptr nocapture readonly [[X:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-SAME: ptr nocapture readonly [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[X]], i64 16
-; CHECK-NEXT:    [[TMP3:%.*]] = load i128, ptr [[TMP2]], align 1
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne i128 [[TMP1]], 70720121592765328381466889075544961328
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne i128 [[TMP3]], 65382562593882267225249597816672106294
-; CHECK-NEXT:    [[TMP6:%.*]] = or i1 [[TMP4]], [[TMP5]]
-; CHECK-NEXT:    ret i1 [[TMP6]]
+; CHECK-NEXT:    [[TMP2:%.*]] = xor i128 [[TMP1]], 70720121592765328381466889075544961328
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[X]], i64 16
+; CHECK-NEXT:    [[TMP4:%.*]] = load i128, ptr [[TMP3]], align 1
+; CHECK-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 65382562593882267225249597816672106294
+; CHECK-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    ret i1 [[TMP7]]
 ;
   %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 32) #0
   %c = icmp ne i32 %m, 0
@@ -757,7 +804,7 @@ define i1 @length32_eq_const(ptr %X) #0 {
 define i32 @length64(ptr %X, ptr %Y) #0 {
 ; CHECK-LABEL: define i32 @length64(
 ; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; CHECK-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr noundef nonnull dereferenceable(64) [[X]], ptr noundef nonnull dereferenceable(64) [[Y]], i64 64) #[[ATTR5]]
+; CHECK-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr noundef nonnull dereferenceable(64) [[X]], ptr noundef nonnull dereferenceable(64) [[Y]], i64 64) #[[ATTR2]]
 ; CHECK-NEXT:    ret i32 [[M]]
 ;
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 64) #0
@@ -768,7 +815,7 @@ define i32 @length64(ptr %X, ptr %Y) #0 {
 define i1 @length64_eq(ptr %x, ptr %y) #0 {
 ; CHECK-LABEL: define i1 @length64_eq(
 ; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; CHECK-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr noundef nonnull dereferenceable(64) [[X]], ptr noundef nonnull dereferenceable(64) [[Y]], i64 64) #[[ATTR5]]
+; CHECK-NEXT:    [[CALL:%.*]] = tail call i32 @memcmp(ptr noundef nonnull dereferenceable(64) [[X]], ptr noundef nonnull dereferenceable(64) [[Y]], i64 64) #[[ATTR2]]
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[CALL]], 0
 ; CHECK-NEXT:    ret i1 [[CMP]]
 ;
@@ -781,7 +828,7 @@ define i1 @length64_eq(ptr %x, ptr %y) #0 {
 define i1 @length64_eq_const(ptr %X) #0 {
 ; CHECK-LABEL: define i1 @length64_eq_const(
 ; CHECK-SAME: ptr nocapture readonly [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; CHECK-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr noundef nonnull dereferenceable(64) [[X]], ptr noundef nonnull dereferenceable(64) @.str, i64 64) #[[ATTR5]]
+; CHECK-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr noundef nonnull dereferenceable(64) [[X]], ptr noundef nonnull dereferenceable(64) @.str, i64 64) #[[ATTR2]]
 ; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
 ; CHECK-NEXT:    ret i1 [[C]]
 ;
@@ -794,7 +841,7 @@ define i1 @length64_eq_const(ptr %X) #0 {
 define i32 @huge_length(ptr %X, ptr %Y) #0 {
 ; CHECK-LABEL: define i32 @huge_length(
 ; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; CHECK-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr noundef nonnull dereferenceable(9223372036854775807) [[X]], ptr noundef nonnull dereferenceable(9223372036854775807) [[Y]], i64 9223372036854775807) #[[ATTR5]]
+; CHECK-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr noundef nonnull dereferenceable(9223372036854775807) [[X]], ptr noundef nonnull dereferenceable(9223372036854775807) [[Y]], i64 9223372036854775807) #[[ATTR2]]
 ; CHECK-NEXT:    ret i32 [[M]]
 ;
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 9223372036854775807) #0
@@ -805,7 +852,7 @@ define i32 @huge_length(ptr %X, ptr %Y) #0 {
 define i1 @huge_length_eq(ptr %X, ptr %Y) #0 {
 ; CHECK-LABEL: define i1 @huge_length_eq(
 ; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; CHECK-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr noundef nonnull dereferenceable(9223372036854775807) [[X]], ptr noundef nonnull dereferenceable(9223372036854775807) [[Y]], i64 9223372036854775807) #[[ATTR5]]
+; CHECK-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr noundef nonnull dereferenceable(9223372036854775807) [[X]], ptr noundef nonnull dereferenceable(9223372036854775807) [[Y]], i64 9223372036854775807) #[[ATTR2]]
 ; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
 ; CHECK-NEXT:    ret i1 [[C]]
 ;
@@ -818,7 +865,7 @@ define i1 @huge_length_eq(ptr %X, ptr %Y) #0 {
 define i32 @nonconst_length(ptr %X, ptr %Y, i64 %size) #0 {
 ; CHECK-LABEL: define i32 @nonconst_length(
 ; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]], i64 [[SIZE:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; CHECK-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR5]]
+; CHECK-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR2]]
 ; CHECK-NEXT:    ret i32 [[M]]
 ;
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 %size) #0
@@ -829,7 +876,7 @@ define i32 @nonconst_length(ptr %X, ptr %Y, i64 %size) #0 {
 define i1 @nonconst_length_eq(ptr %X, ptr %Y, i64 %size) #0 {
 ; CHECK-LABEL: define i1 @nonconst_length_eq(
 ; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]], i64 [[SIZE:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; CHECK-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR5]]
+; CHECK-NEXT:    [[M:%.*]] = tail call i32 @memcmp(ptr [[X]], ptr [[Y]], i64 [[SIZE]]) #[[ATTR2]]
 ; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[M]], 0
 ; CHECK-NEXT:    ret i1 [[C]]
 ;
@@ -841,11 +888,13 @@ define i1 @nonconst_length_eq(ptr %X, ptr %Y, i64 %size) #0 {
 ; Function Attrs: nounwind
 define i1 @bcmp_length2(ptr %X, ptr %Y) #0 {
 ; CHECK-LABEL: define i1 @bcmp_length2(
-; CHECK-SAME: ptr nocapture readonly [[X:%.*]], ptr nocapture readonly [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
+; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr [[Y]], align 1
-; CHECK-NEXT:    [[DOTNOT:%.*]] = icmp eq i16 [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    ret i1 [[DOTNOT]]
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne i16 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[TMP4]], 0
+; CHECK-NEXT:    ret i1 [[C]]
 ;
   %m = tail call i32 @bcmp(ptr %X, ptr %Y, i64 2) #0
   %c = icmp eq i32 %m, 0

>From e1e57398eb1d790237c5ca22c5b8493e4a8e2849 Mon Sep 17 00:00:00 2001
From: Gabriel Baraldi <baraldigabriel at gmail.com>
Date: Wed, 10 Jan 2024 18:18:56 +0000
Subject: [PATCH 06/11] Whitespace

---
 llvm/lib/Passes/PassBuilderPipelines.cpp    | 3 ++-
 llvm/lib/Transforms/Scalar/ExpandMemCmp.cpp | 5 +++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index 391d35e968ae1c..2955cc7c68ca58 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -1432,7 +1432,8 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level,
   // result too early.
   OptimizePM.addPass(LoopSinkPass());
 
-  // Detect anc convert memcpm like idioms to the call, and expand when profitable
+  // Detect anc convert memcpm like idioms to the call, and expand when
+  // profitable
   OptimizePM.addPass(MergeICmpsPass());
   OptimizePM.addPass(ExpandMemCmpPass(TM));
 
diff --git a/llvm/lib/Transforms/Scalar/ExpandMemCmp.cpp b/llvm/lib/Transforms/Scalar/ExpandMemCmp.cpp
index ed3843de422f00..a8577b5229ed7c 100644
--- a/llvm/lib/Transforms/Scalar/ExpandMemCmp.cpp
+++ b/llvm/lib/Transforms/Scalar/ExpandMemCmp.cpp
@@ -921,8 +921,9 @@ PreservedAnalyses runImpl(Function &F, const TargetLibraryInfo *TLI,
                           ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI,
                           DominatorTree *DT) {
   // Sanitizers prefer that calls to memcmp remain as such
-  //so that they may be itercepted, but since the sanitizer passes run late
-  // we disable the optimization here. See maybeMarkSanitizerLibraryCallNoBuiltin
+  // so that they may be intercepted, but since the sanitizer passes run late
+  // we disable the optimization here. See
+  // maybeMarkSanitizerLibraryCallNoBuiltin
   if (F.hasFnAttribute(Attribute::SanitizeMemory) ||
       F.hasFnAttribute(Attribute::SanitizeAddress) ||
       F.hasFnAttribute(Attribute::SanitizeHWAddress) ||

>From ba9373d16e956b463780b57b2bca0a26ac1355af Mon Sep 17 00:00:00 2001
From: Gabriel Baraldi <baraldigabriel at gmail.com>
Date: Tue, 16 Jan 2024 11:09:23 -0300
Subject: [PATCH 07/11] Small Fixes

---
 llvm/lib/Passes/PassBuilderPipelines.cpp           | 14 +++++++++-----
 llvm/test/Other/new-pm-defaults.ll                 |  6 +++---
 .../test/Other/new-pm-thinlto-postlink-defaults.ll |  4 ++--
 .../Other/new-pm-thinlto-postlink-pgo-defaults.ll  |  4 ++--
 .../new-pm-thinlto-postlink-samplepgo-defaults.ll  |  4 ++--
 .../Transforms/PhaseOrdering/PowerPC/lit.local.cfg |  2 +-
 6 files changed, 19 insertions(+), 15 deletions(-)

diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index 2955cc7c68ca58..fec23d20cbb68c 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -1432,19 +1432,19 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level,
   // result too early.
   OptimizePM.addPass(LoopSinkPass());
 
-  // Detect anc convert memcpm like idioms to the call, and expand when
-  // profitable
-  OptimizePM.addPass(MergeICmpsPass());
-  OptimizePM.addPass(ExpandMemCmpPass(TM));
-
   // And finally clean up LCSSA form before generating code.
   OptimizePM.addPass(InstSimplifyPass());
 
+
   // This hoists/decomposes div/rem ops. It should run after other sink/hoist
   // passes to avoid re-sinking, but before SimplifyCFG because it can allow
   // flattening of blocks.
   OptimizePM.addPass(DivRemPairsPass());
 
+  // Detect and convert memcmp like idioms to the call then expand them if profitable
+  OptimizePM.addPass(MergeICmpsPass());
+  OptimizePM.addPass(ExpandMemCmpPass(TM));
+  
   // Try to annotate calls that were created during optimization.
   OptimizePM.addPass(TailCallElimPass());
 
@@ -1963,6 +1963,10 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
   // flattening of blocks.
   LateFPM.addPass(DivRemPairsPass());
 
+  // Detect and convert memcmp like idioms to the call then expand them if profitable
+  OptimizePM.addPass(MergeICmpsPass());
+  OptimizePM.addPass(ExpandMemCmpPass(TM));
+
   // Delete basic blocks, which optimization passes may have killed.
   LateFPM.addPass(SimplifyCFGPass(
       SimplifyCFGOptions().convertSwitchRangeToICmp(true).hoistCommonInsts(
diff --git a/llvm/test/Other/new-pm-defaults.ll b/llvm/test/Other/new-pm-defaults.ll
index c5d70a6a0b2c33..26c8ce5fe9e5a4 100644
--- a/llvm/test/Other/new-pm-defaults.ll
+++ b/llvm/test/Other/new-pm-defaults.ll
@@ -267,10 +267,10 @@
 ; CHECK-O-NEXT: Running pass: LICMPass
 ; CHECK-O-NEXT: Running pass: AlignmentFromAssumptionsPass
 ; CHECK-O-NEXT: Running pass: LoopSinkPass
-; CHECK-O-NEXT: Running pass: MergeICmpsPass
-; CHECK-O-NEXT: Running pass: ExpandMemCmpPass
 ; CHECK-O-NEXT: Running pass: InstSimplifyPass
 ; CHECK-O-NEXT: Running pass: DivRemPairsPass
+; CHECK-O-NEXT: Running pass: MergeICmpsPass
+; CHECK-O-NEXT: Running pass: ExpandMemCmpPass
 ; CHECK-O-NEXT: Running pass: TailCallElimPass
 ; CHECK-O-NEXT: Running pass: SimplifyCFGPass
 ; CHECK-EP-OPTIMIZER-LAST: Running pass: NoOpModulePass
@@ -315,4 +315,4 @@ loop:
   br i1 %cmp, label %exit, label %loop
 exit:
   ret void
-}
\ No newline at end of file
+}
diff --git a/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll
index a31a1b069b0474..be4bd7dd226842 100644
--- a/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll
@@ -195,10 +195,10 @@
 ; CHECK-POSTLINK-O-NEXT: Running pass: LICMPass
 ; CHECK-POSTLINK-O-NEXT: Running pass: AlignmentFromAssumptionsPass
 ; CHECK-POSTLINK-O-NEXT: Running pass: LoopSinkPass
-; CHECK-POSTLINK-O-NEXT: Running pass: MergeICmpsPass
-; CHECK-POSTLINK-O-NEXT: Running pass: ExpandMemCmpPass
 ; CHECK-POSTLINK-O-NEXT: Running pass: InstSimplifyPass
 ; CHECK-POSTLINK-O-NEXT: Running pass: DivRemPairsPass
+; CHECK-POSTLINK-O-NEXT: Running pass: MergeICmpsPass
+; CHECK-POSTLINK-O-NEXT: Running pass: ExpandMemCmpPass
 ; CHECK-POSTLINK-O-NEXT: Running pass: TailCallElimPass
 ; CHECK-POSTLINK-O-NEXT: Running pass: SimplifyCFGPass
 ; CHECK-POST-EP-OPT-LAST-NEXT: Running pass: NoOpModulePass
diff --git a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
index 03a77c0d5bc8b6..a77013809ccf0e 100644
--- a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
@@ -180,10 +180,10 @@
 ; CHECK-O-NEXT: Running pass: LICMPass
 ; CHECK-O-NEXT: Running pass: AlignmentFromAssumptionsPass
 ; CHECK-O-NEXT: Running pass: LoopSinkPass
-; CHECK-O-NEXT: Running pass: MergeICmpsPass
-; CHECK-O-NEXT: Running pass: ExpandMemCmpPass
 ; CHECK-O-NEXT: Running pass: InstSimplifyPass
 ; CHECK-O-NEXT: Running pass: DivRemPairsPass
+; CHECK-O-NEXT: Running pass: MergeICmpsPass
+; CHECK-O-NEXT: Running pass: ExpandMemCmpPass
 ; CHECK-O-NEXT: Running pass: TailCallElimPass
 ; CHECK-O-NEXT: Running pass: SimplifyCFGPass
 ; CHECK-O-NEXT: Running pass: GlobalDCEPass
diff --git a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
index b5c5f0f537e774..b2c6464108d4d5 100644
--- a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
@@ -188,10 +188,10 @@
 ; CHECK-O-NEXT: Running pass: LICMPass
 ; CHECK-O-NEXT: Running pass: AlignmentFromAssumptionsPass
 ; CHECK-O-NEXT: Running pass: LoopSinkPass
-; CHECK-O-NEXT: Running pass: MergeICmpsPass
-; CHECK-O-NEXT: Running pass: ExpandMemCmpPass
 ; CHECK-O-NEXT: Running pass: InstSimplifyPass
 ; CHECK-O-NEXT: Running pass: DivRemPairsPass
+; CHECK-O-NEXT: Running pass: MergeICmpsPass
+; CHECK-O-NEXT: Running pass: ExpandMemCmpPass
 ; CHECK-O-NEXT: Running pass: TailCallElimPass
 ; CHECK-O-NEXT: Running pass: SimplifyCFGPass
 ; CHECK-O-NEXT: Running pass: GlobalDCEPass
diff --git a/llvm/test/Transforms/PhaseOrdering/PowerPC/lit.local.cfg b/llvm/test/Transforms/PhaseOrdering/PowerPC/lit.local.cfg
index dfb347e640e144..091332439b1867 100644
--- a/llvm/test/Transforms/PhaseOrdering/PowerPC/lit.local.cfg
+++ b/llvm/test/Transforms/PhaseOrdering/PowerPC/lit.local.cfg
@@ -1,2 +1,2 @@
 if not 'PowerPC' in config.root.targets:
-    config.unsupported = True
\ No newline at end of file
+    config.unsupported = True

>From 8d670ef777478e26b0c42b7ce7d12bf4eea7b457 Mon Sep 17 00:00:00 2001
From: Gabriel Baraldi <baraldigabriel at gmail.com>
Date: Tue, 16 Jan 2024 11:33:22 -0300
Subject: [PATCH 08/11] Remove unused TM member

---
 llvm/include/llvm/Transforms/Scalar/ExpandMemCmp.h | 3 +--
 llvm/lib/Passes/PassBuilderPipelines.cpp           | 6 +++---
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/llvm/include/llvm/Transforms/Scalar/ExpandMemCmp.h b/llvm/include/llvm/Transforms/Scalar/ExpandMemCmp.h
index 94ba0cf9305040..3b5d3cab0d80ef 100644
--- a/llvm/include/llvm/Transforms/Scalar/ExpandMemCmp.h
+++ b/llvm/include/llvm/Transforms/Scalar/ExpandMemCmp.h
@@ -16,10 +16,9 @@ namespace llvm {
 class TargetMachine;
 
 class ExpandMemCmpPass : public PassInfoMixin<ExpandMemCmpPass> {
-  const TargetMachine *TM;
 
 public:
-  explicit ExpandMemCmpPass(const TargetMachine *TM_) : TM(TM_) {}
+  explicit ExpandMemCmpPass() {}
 
   PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM);
 };
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index fec23d20cbb68c..ddede72f721556 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -1443,8 +1443,8 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level,
 
   // Detect and convert memcmp like idioms to the call then expand them if profitable
   OptimizePM.addPass(MergeICmpsPass());
-  OptimizePM.addPass(ExpandMemCmpPass(TM));
-  
+  OptimizePM.addPass(ExpandMemCmpPass());
+
   // Try to annotate calls that were created during optimization.
   OptimizePM.addPass(TailCallElimPass());
 
@@ -1965,7 +1965,7 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
 
   // Detect and convert memcmp like idioms to the call then expand them if profitable
   OptimizePM.addPass(MergeICmpsPass());
-  OptimizePM.addPass(ExpandMemCmpPass(TM));
+  OptimizePM.addPass(ExpandMemCmpPass());
 
   // Delete basic blocks, which optimization passes may have killed.
   LateFPM.addPass(SimplifyCFGPass(

>From 1a340058285eda1f8c4498bd70bb81ed181eb79f Mon Sep 17 00:00:00 2001
From: Gabriel Baraldi <baraldigabriel at gmail.com>
Date: Tue, 16 Jan 2024 11:54:19 -0300
Subject: [PATCH 09/11] Actually remove the TM from everywhere

---
 llvm/lib/Passes/PassBuilderPipelines.cpp | 4 ++--
 llvm/lib/Passes/PassRegistry.def         | 3 +--
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index ddede72f721556..7f1ded591d6a29 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -1964,8 +1964,8 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
   LateFPM.addPass(DivRemPairsPass());
 
   // Detect and convert memcmp like idioms to the call then expand them if profitable
-  OptimizePM.addPass(MergeICmpsPass());
-  OptimizePM.addPass(ExpandMemCmpPass());
+  LateFPM.addPass(MergeICmpsPass());
+  LateFPM.addPass(ExpandMemCmpPass());
 
   // Delete basic blocks, which optimization passes may have killed.
   LateFPM.addPass(SimplifyCFGPass(
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index 3acb72912709f1..efb361518dc2e6 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -314,7 +314,6 @@ FUNCTION_PASS("dse", DSEPass())
 FUNCTION_PASS("dwarf-eh-prepare", DwarfEHPreparePass(TM))
 FUNCTION_PASS("expand-large-div-rem", ExpandLargeDivRemPass(TM))
 FUNCTION_PASS("expand-large-fp-convert", ExpandLargeFpConvertPass(TM))
-FUNCTION_PASS("expand-memcmp", ExpandMemCmpPass(TM))
 FUNCTION_PASS("fix-irreducible", FixIrreduciblePass())
 FUNCTION_PASS("flattencfg", FlattenCFGPass())
 FUNCTION_PASS("float2int", Float2IntPass())
@@ -359,7 +358,7 @@ FUNCTION_PASS("mem2reg", PromotePass())
 FUNCTION_PASS("memcpyopt", MemCpyOptPass())
 FUNCTION_PASS("memprof", MemProfilerPass())
 FUNCTION_PASS("mergeicmps", MergeICmpsPass())
-FUNCTION_PASS("expand-memcmp", ExpandMemCmpPass(TM))
+FUNCTION_PASS("expand-memcmp", ExpandMemCmpPass())
 FUNCTION_PASS("mergereturn", UnifyFunctionExitNodesPass())
 FUNCTION_PASS("move-auto-init", MoveAutoInitPass())
 FUNCTION_PASS("nary-reassociate", NaryReassociatePass())

>From 499ffeb175adf4fce2cd7a5b33a37ca171b1f3cb Mon Sep 17 00:00:00 2001
From: Gabriel Baraldi <baraldigabriel at gmail.com>
Date: Wed, 17 Jan 2024 17:45:42 +0000
Subject: [PATCH 10/11] Update tests that had a slight change

---
 llvm/test/Other/new-pm-lto-defaults.ll           |  2 ++
 llvm/test/Transforms/PhaseOrdering/X86/memcmp.ll | 10 ++++++++++
 2 files changed, 12 insertions(+)

diff --git a/llvm/test/Other/new-pm-lto-defaults.ll b/llvm/test/Other/new-pm-lto-defaults.ll
index d451d2897f673c..1691e186fb862e 100644
--- a/llvm/test/Other/new-pm-lto-defaults.ll
+++ b/llvm/test/Other/new-pm-lto-defaults.ll
@@ -141,6 +141,8 @@
 ; CHECK-O-NEXT: Running pass: LowerTypeTestsPass
 ; CHECK-O23SZ-NEXT: Running pass: LoopSink
 ; CHECK-O23SZ-NEXT: Running pass: DivRemPairs
+; CHECK-O23SZ-NEXT: Running pass: MergeICmpsPass
+; CHECK-O23SZ-NEXT: Running pass: ExpandMemCmpPass
 ; CHECK-O23SZ-NEXT: Running pass: SimplifyCFGPass
 ; CHECK-O23SZ-NEXT: Running pass: EliminateAvailableExternallyPass
 ; CHECK-O23SZ-NEXT: Running pass: GlobalDCEPass
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/memcmp.ll b/llvm/test/Transforms/PhaseOrdering/X86/memcmp.ll
index de90aec1a49c78..a9dbf5cf4b58e6 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/memcmp.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/memcmp.ll
@@ -116,6 +116,7 @@ define i1 @length2_eq_const(ptr %X) #0 {
 ; CHECK-SAME: ptr nocapture readonly [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[X]], align 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = icmp ne i16 [[TMP1]], 12849
+; CHECK-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
 ; CHECK-NEXT:    ret i1 [[TMP2]]
 ;
   %m = tail call i32 @memcmp(ptr %X, ptr getelementptr inbounds ([65 x i8], ptr @.str, i32 0, i32 1), i64 2) #0
@@ -184,6 +185,7 @@ define i1 @length3_eq(ptr %X, ptr %Y) #0 {
 ; CHECK-NEXT:    [[TMP10:%.*]] = xor i16 [[TMP8]], [[TMP9]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = or i16 [[TMP3]], [[TMP10]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne i16 [[TMP11]], 0
+; CHECK-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
 ; CHECK-NEXT:    ret i1 [[TMP12]]
 ;
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 3) #0
@@ -217,6 +219,7 @@ define i1 @length4_eq(ptr %X, ptr %Y) #0 {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[X]], align 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[Y]], align 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
 ; CHECK-NEXT:    ret i1 [[TMP3]]
 ;
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 4) #0
@@ -319,6 +322,7 @@ define i1 @length5_eq(ptr %X, ptr %Y) #0 {
 ; CHECK-NEXT:    [[TMP10:%.*]] = xor i32 [[TMP8]], [[TMP9]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = or i32 [[TMP3]], [[TMP10]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
+; CHECK-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
 ; CHECK-NEXT:    ret i1 [[TMP12]]
 ;
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 5) #0
@@ -374,6 +378,7 @@ define i1 @length7_eq(ptr %X, ptr %Y) #0 {
 ; CHECK-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP7]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = or i32 [[TMP3]], [[TMP8]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne i32 [[TMP9]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = zext i1 [[TMP10]] to i32
 ; CHECK-NEXT:    ret i1 [[TMP10]]
 ;
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 7) #0
@@ -422,6 +427,7 @@ define i1 @length8_eq_const(ptr %X) #0 {
 ; CHECK-SAME: ptr nocapture readonly [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[X]], align 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = icmp ne i64 [[TMP1]], 3978425819141910832
+; CHECK-NEXT:    [[TMP3:%.*]] = zext i1 [[TMP2]] to i32
 ; CHECK-NEXT:    ret i1 [[TMP2]]
 ;
   %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 8) #0
@@ -518,6 +524,7 @@ define i1 @length12_eq(ptr %X, ptr %Y) #0 {
 ; CHECK-NEXT:    [[TMP10:%.*]] = xor i64 [[TMP8]], [[TMP9]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = or i64 [[TMP3]], [[TMP10]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne i64 [[TMP11]], 0
+; CHECK-NEXT:    [[TMP13:%.*]] = zext i1 [[TMP12]] to i32
 ; CHECK-NEXT:    ret i1 [[TMP12]]
 ;
   %m = tail call i32 @memcmp(ptr %X, ptr %Y, i64 12) #0
@@ -671,6 +678,7 @@ define i1 @length16_eq(ptr %x, ptr %y) #0 {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i128, ptr [[X]], align 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = load i128, ptr [[Y]], align 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne i128 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
 ; CHECK-NEXT:    ret i1 [[TMP3]]
 ;
   %call = tail call i32 @memcmp(ptr %x, ptr %y, i64 16) #0
@@ -741,6 +749,7 @@ define i1 @length24_eq_const(ptr %X) #0 {
 ; CHECK-NEXT:    [[TMP6:%.*]] = xor i128 [[TMP5]], 3689065127958034230
 ; CHECK-NEXT:    [[TMP7:%.*]] = or i128 [[TMP2]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = zext i1 [[TMP8]] to i32
 ; CHECK-NEXT:    ret i1 [[TMP8]]
 ;
   %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 24) #0
@@ -793,6 +802,7 @@ define i1 @length32_eq_const(ptr %X) #0 {
 ; CHECK-NEXT:    [[TMP5:%.*]] = xor i128 [[TMP4]], 65382562593882267225249597816672106294
 ; CHECK-NEXT:    [[TMP6:%.*]] = or i128 [[TMP2]], [[TMP5]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT:    [[TMP8:%.*]] = zext i1 [[TMP7]] to i32
 ; CHECK-NEXT:    ret i1 [[TMP7]]
 ;
   %m = tail call i32 @memcmp(ptr %X, ptr @.str, i64 32) #0

>From 150e6b0093aa368475cd077958e4d85a3b811a35 Mon Sep 17 00:00:00 2001
From: Gabriel Baraldi <baraldigabriel at gmail.com>
Date: Wed, 17 Jan 2024 17:58:01 +0000
Subject: [PATCH 11/11] formatting change

---
 llvm/lib/Passes/PassBuilderPipelines.cpp | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index 7f1ded591d6a29..a129160671da51 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -1435,13 +1435,13 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level,
   // And finally clean up LCSSA form before generating code.
   OptimizePM.addPass(InstSimplifyPass());
 
-
   // This hoists/decomposes div/rem ops. It should run after other sink/hoist
   // passes to avoid re-sinking, but before SimplifyCFG because it can allow
   // flattening of blocks.
   OptimizePM.addPass(DivRemPairsPass());
 
-  // Detect and convert memcmp like idioms to the call then expand them if profitable
+  // Detect and convert memcmp like idioms to the call then expand them if
+  // profitable
   OptimizePM.addPass(MergeICmpsPass());
   OptimizePM.addPass(ExpandMemCmpPass());
 
@@ -1963,7 +1963,8 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
   // flattening of blocks.
   LateFPM.addPass(DivRemPairsPass());
 
-  // Detect and convert memcmp like idioms to the call then expand them if profitable
+  // Detect and convert memcmp like idioms to the call then expand them if
+  // profitable
   LateFPM.addPass(MergeICmpsPass());
   LateFPM.addPass(ExpandMemCmpPass());
 



More information about the cfe-commits mailing list